From 761fa5844e692dbc7e0dcf8e30e80ef7ba38a317 Mon Sep 17 00:00:00 2001 From: c8ef Date: Fri, 18 Oct 2024 14:20:34 +0800 Subject: [PATCH 001/511] [TLI] Add support for the `ilogb` libcall. (#112725) This patch adds the `ilogb` libcall. Constant folding will be handled in subsequent patches. --- .../math-libcalls-tbaa-indirect-args.c | 18 ++++++++--------- .../llvm/Analysis/TargetLibraryInfo.def | 15 ++++++++++++++ llvm/lib/Analysis/TargetLibraryInfo.cpp | 3 +++ llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 3 +++ .../Transforms/InferFunctionAttrs/annotate.ll | 9 +++++++++ .../tools/llvm-tli-checker/ps4-tli-check.yaml | 20 +++++++++++++++---- .../Analysis/TargetLibraryInfoTest.cpp | 3 +++ 7 files changed, 58 insertions(+), 13 deletions(-) diff --git a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c index b94f9641decc..8e5f015647e4 100644 --- a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c +++ b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c @@ -153,39 +153,39 @@ _Complex long double test_cargl(_Complex long double cld) { int ilogbl(long double a); // CHECK-LABEL: define dso_local i32 @test_ilogb( -// CHECK-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[TBAA2]] // // CHECK-WIN64-LABEL: define dso_local i32 @test_ilogb( -// CHECK-WIN64-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-WIN64-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-WIN64: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[TBAA2]] // // CHECK-I686-LABEL: define dso_local i32 @test_ilogb( -// CHECK-I686-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-I686-SAME: x86_fp80 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-I686: [[CALL:%.*]] = tail call i32 @ilogbl(x86_fp80 noundef [[A]]) #[[ATTR5]], !tbaa [[TBAA3]] // // CHECK-PPC-LABEL: define dso_local i32 @test_ilogb( -// CHECK-PPC-SAME: ppc_fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-PPC-SAME: ppc_fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-PPC: [[CALL:%.*]] = tail call i32 @ilogbl(ppc_fp128 noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA2]] // // CHECK-ARM-LABEL: define dso_local i32 @test_ilogb( -// CHECK-ARM-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-ARM-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-ARM: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] // // CHECK-ARM-HF-LABEL: define dso_local i32 @test_ilogb( -// CHECK-ARM-HF-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-ARM-HF-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-ARM-HF: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] // // CHECK-THUMB-LABEL: define i32 @test_ilogb( -// CHECK-THUMB-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-THUMB-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-THUMB: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] // // CHECK-AARCH-LABEL: define dso_local i32 @test_ilogb( -// CHECK-AARCH-SAME: fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-AARCH-SAME: fp128 noundef [[A:%.*]]) 
local_unnamed_addr #[[ATTR0]] { // CHECK-AARCH: [[CALL:%.*]] = tail call i32 @ilogbl(fp128 noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA2]] // // CHECK-SPIR-LABEL: define dso_local spir_func i32 @test_ilogb( -// CHECK-SPIR-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-SPIR-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-SPIR: [[CALL:%.*]] = tail call spir_func i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA2]] // // CHECK-MINGW32-LABEL: define dso_local i32 @test_ilogb( diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index 9b9affd41809..d472cde3d504 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -1751,6 +1751,21 @@ TLI_DEFINE_ENUM_INTERNAL(log2l) TLI_DEFINE_STRING_INTERNAL("log2l") TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl) +/// int ilogb(double x); +TLI_DEFINE_ENUM_INTERNAL(ilogb) +TLI_DEFINE_STRING_INTERNAL("ilogb") +TLI_DEFINE_SIG_INTERNAL(Int, Dbl) + +/// int ilogbf(float x); +TLI_DEFINE_ENUM_INTERNAL(ilogbf) +TLI_DEFINE_STRING_INTERNAL("ilogbf") +TLI_DEFINE_SIG_INTERNAL(Int, Flt) + +/// int ilogbl(long double x); +TLI_DEFINE_ENUM_INTERNAL(ilogbl) +TLI_DEFINE_STRING_INTERNAL("ilogbl") +TLI_DEFINE_SIG_INTERNAL(Int, LDbl) + /// double logb(double x); TLI_DEFINE_ENUM_INTERNAL(logb) TLI_DEFINE_STRING_INTERNAL("logb") diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 1785d77bca98..d9651d2f47c6 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -372,6 +372,8 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_log2); TLI.setUnavailable(LibFunc_log2f); TLI.setAvailableWithName(LibFunc_logb, "_logb"); + TLI.setUnavailable(LibFunc_ilogb); + TLI.setUnavailable(LibFunc_ilogbf); if (hasPartialFloat) TLI.setAvailableWithName(LibFunc_logbf, "_logbf"); else @@ -398,6 +400,7 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_log1pl); TLI.setUnavailable(LibFunc_log2l); TLI.setUnavailable(LibFunc_logbl); + TLI.setUnavailable(LibFunc_ilogbl); TLI.setUnavailable(LibFunc_nearbyintl); TLI.setUnavailable(LibFunc_rintl); TLI.setUnavailable(LibFunc_roundl); diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 7bb4b55fcb7c..c97a77d12e3e 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1229,6 +1229,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_logb: case LibFunc_logbf: case LibFunc_logbl: + case LibFunc_ilogb: + case LibFunc_ilogbf: + case LibFunc_ilogbl: case LibFunc_logf: case LibFunc_logl: case LibFunc_nearbyint: diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll index 7c33d4765f6d..8567cc00ed00 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -643,6 +643,15 @@ declare float @log2f(float) ; CHECK: declare x86_fp80 @log2l(x86_fp80) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] declare x86_fp80 @log2l(x86_fp80) +; CHECK: declare i32 @ilogb(double) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare i32 @ilogb(double) + +; CHECK: declare i32 @ilogbf(float) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare i32 @ilogbf(float) + +; CHECK: declare 
i32 @ilogbl(x86_fp80) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare i32 @ilogbl(x86_fp80) + ; CHECK: declare double @logb(double) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] declare double @logb(double) diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml index 3eb6d8b8eea9..aad5794fd8c2 100644 --- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml +++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml @@ -34,7 +34,7 @@ # # CHECK: << Total TLI yes SDK no: 18 # CHECK: >> Total TLI no SDK yes: 0 -# CHECK: == Total TLI yes SDK yes: 256 +# CHECK: == Total TLI yes SDK yes: 259 # # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*) # WRONG_DETAIL: >> TLI no SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int) @@ -48,14 +48,14 @@ # WRONG_DETAIL: << TLI yes SDK no : 'fminimum_numl' # WRONG_SUMMARY: << Total TLI yes SDK no: 19{{$}} # WRONG_SUMMARY: >> Total TLI no SDK yes: 1{{$}} -# WRONG_SUMMARY: == Total TLI yes SDK yes: 255 +# WRONG_SUMMARY: == Total TLI yes SDK yes: 258 # ## The -COUNT suffix doesn't care if there are too many matches, so check ## the exact count first; the two directives should add up to that. ## Yes, this means additions to TLI will fail this test, but the argument ## to -COUNT can't be an expression. -# AVAIL: TLI knows 507 symbols, 274 available -# AVAIL-COUNT-274: {{^}} available +# AVAIL: TLI knows 510 symbols, 277 available +# AVAIL-COUNT-277: {{^}} available # AVAIL-NOT: {{^}} available # UNAVAIL-COUNT-233: not available # UNAVAIL-NOT: not available @@ -654,6 +654,18 @@ DynamicSymbols: Type: STT_FUNC Section: .text Binding: STB_GLOBAL + - Name: ilogb + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: ilogbf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: ilogbl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL - Name: logb Type: STT_FUNC Section: .text diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp index 4975651b1e50..b4856b50bbe5 100644 --- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp +++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp @@ -266,6 +266,9 @@ TEST_F(TargetLibraryInfoTest, ValidProto) { "declare double @log2(double)\n" "declare float @log2f(float)\n" "declare x86_fp80 @log2l(x86_fp80)\n" + "declare i32 @ilogb(double)\n" + "declare i32 @ilogbf(float)\n" + "declare i32 @ilogbl(x86_fp80)\n" "declare double @logb(double)\n" "declare float @logbf(float)\n" "declare x86_fp80 @logbl(x86_fp80)\n" -- GitLab From 7be4ab0a86f9a52f1b49dad5665617441ec24a2e Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Fri, 18 Oct 2024 11:55:57 +0530 Subject: [PATCH 002/511] [libc][complex] Added support for CFP16 and CFP128 (#112594) Fixes: #112217 --- libc/include/llvm-libc-types/CMakeLists.txt | 8 ++++ libc/include/llvm-libc-types/cfloat128.h | 38 +++++++++++++++++++ libc/include/llvm-libc-types/cfloat16.h | 20 ++++++++++ libc/src/__support/CPP/CMakeLists.txt | 2 + libc/src/__support/CPP/type_traits.h | 1 - .../__support/CPP/type_traits/is_complex.h | 15 +++++++- .../macros/properties/CMakeLists.txt | 10 +++++ .../macros/properties/complex_types.h | 25 ++++++++++++ libc/test/UnitTest/FPMatcher.h | 17 +++++++++ 9 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 libc/include/llvm-libc-types/cfloat128.h create mode 100644 libc/include/llvm-libc-types/cfloat16.h create mode 100644 libc/src/__support/macros/properties/complex_types.h diff --git 
a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index a4cf4631c847..836e8a507bd6 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -134,6 +134,14 @@ add_header( DEPENDS libc.include.llvm-libc-macros.float_macros ) +add_header( + cfloat128 + HDR + cfloat128.h + DEPENDS + libc.include.llvm-libc-macros.float_macros +) +add_header(cfloat16 HDR cfloat16.h) add_header(fsblkcnt_t HDR fsblkcnt_t.h) add_header(fsfilcnt_t HDR fsfilcnt_t.h) add_header( diff --git a/libc/include/llvm-libc-types/cfloat128.h b/libc/include/llvm-libc-types/cfloat128.h new file mode 100644 index 000000000000..0cc8ed3041d6 --- /dev/null +++ b/libc/include/llvm-libc-types/cfloat128.h @@ -0,0 +1,38 @@ +//===-- Definition of cfloat128 type --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_CFLOAT128_H +#define LLVM_LIBC_TYPES_CFLOAT128_H + +#include "../llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG + +// Currently, the complex variant of C23 `_Float128` type is only defined as a +// built-in type in GCC 7 or later, and only for C. For C++, or for clang, +// the complex variant of `__float128` is defined instead, and only on x86-64 +// targets. +// +// TODO: Update the complex variant of C23 `_Float128` type detection again when +// clang supports it. +// https://github.com/llvm/llvm-project/issues/80195 +#if defined(__STDC_IEC_60559_COMPLEX__) && !defined(__clang__) && \ + !defined(__cplusplus) +#define LIBC_TYPES_HAS_CFLOAT128 +typedef _Complex _Float128 cfloat128; +#elif defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__) +// Use _Complex __float128 type. gcc and clang sometime use __SIZEOF_FLOAT128__ +// to notify the availability of __float128. clang also uses __FLOAT128__ macro +// to notify the availability of __float128 type: +// https://reviews.llvm.org/D15120 +#define LIBC_TYPES_HAS_CFLOAT128 +typedef _Complex __float128 cfloat128; +#elif (LDBL_MANT_DIG == 113) +#define LIBC_TYPES_HAS_CFLOAT128 +typedef _Complex long double cfloat128; +#endif + +#endif // LLVM_LIBC_TYPES_CFLOAT128_H diff --git a/libc/include/llvm-libc-types/cfloat16.h b/libc/include/llvm-libc-types/cfloat16.h new file mode 100644 index 000000000000..e7e5631e0250 --- /dev/null +++ b/libc/include/llvm-libc-types/cfloat16.h @@ -0,0 +1,20 @@ +//===-- Definition of cfloat16 type ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_CFLOAT16_H
+#define LLVM_LIBC_TYPES_CFLOAT16_H
+
+#if defined(__FLT16_MANT_DIG__) &&                                            \
+    (!defined(__GNUC__) || __GNUC__ >= 13 || defined(__clang__)) &&           \
+    !defined(__arm__) && !defined(_M_ARM) && !defined(__riscv) &&             \
+    !defined(_WIN32)
+#define LIBC_TYPES_HAS_CFLOAT16
+typedef _Complex _Float16 cfloat16;
+#endif
+
+#endif // LLVM_LIBC_TYPES_CFLOAT16_H
diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt
index c1981b827042..774668be42e5 100644
--- a/libc/src/__support/CPP/CMakeLists.txt
+++ b/libc/src/__support/CPP/CMakeLists.txt
@@ -126,6 +126,7 @@ add_header_library(
     type_traits/is_array.h
     type_traits/is_base_of.h
     type_traits/is_class.h
+    type_traits/is_complex.h
     type_traits/is_const.h
     type_traits/is_constant_evaluated.h
     type_traits/is_convertible.h
@@ -165,6 +166,7 @@ add_header_library(
     libc.include.llvm-libc-macros.stdfix_macros
     libc.src.__support.macros.attributes
     libc.src.__support.macros.properties.types
+    libc.src.__support.macros.properties.complex_types
 )
 
 add_header_library(
diff --git a/libc/src/__support/CPP/type_traits.h b/libc/src/__support/CPP/type_traits.h
index cef4e5d1f0b1..d50b6612656d 100644
--- a/libc/src/__support/CPP/type_traits.h
+++ b/libc/src/__support/CPP/type_traits.h
@@ -25,7 +25,6 @@
 #include "src/__support/CPP/type_traits/is_array.h"
 #include "src/__support/CPP/type_traits/is_base_of.h"
 #include "src/__support/CPP/type_traits/is_class.h"
-#include "src/__support/CPP/type_traits/is_complex.h"
 #include "src/__support/CPP/type_traits/is_const.h"
 #include "src/__support/CPP/type_traits/is_constant_evaluated.h"
 #include "src/__support/CPP/type_traits/is_convertible.h"
diff --git a/libc/src/__support/CPP/type_traits/is_complex.h b/libc/src/__support/CPP/type_traits/is_complex.h
index 4f5ee9abdb33..23f05c08ccab 100644
--- a/libc/src/__support/CPP/type_traits/is_complex.h
+++ b/libc/src/__support/CPP/type_traits/is_complex.h
@@ -10,6 +10,10 @@
 
 #include "src/__support/CPP/type_traits/is_same.h"
 #include "src/__support/CPP/type_traits/remove_cv.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+// LIBC_TYPES_HAS_CFLOAT16 && LIBC_TYPES_HAS_CFLOAT128
+#include "src/__support/macros/properties/complex_types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace cpp {
@@ -25,7 +29,16 @@ private:
 public:
   LIBC_INLINE_VAR static constexpr bool value =
-      __is_unqualified_any_of<T, _Complex float, _Complex double,
-                              _Complex long double>();
+      __is_unqualified_any_of<T, _Complex float, _Complex double,
+                              _Complex long double
+#ifdef LIBC_TYPES_HAS_CFLOAT16
+                              ,
+                              cfloat16
+#endif
+#ifdef LIBC_TYPES_HAS_CFLOAT128
+                              ,
+                              cfloat128
+#endif
+                              >();
 };
 template <typename T>
 LIBC_INLINE_VAR constexpr bool is_complex_v = is_complex<T>::value;
diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt
index c69f3a85d728..80ed63a2fbcf 100644
--- a/libc/src/__support/macros/properties/CMakeLists.txt
+++ b/libc/src/__support/macros/properties/CMakeLists.txt
@@ -37,3 +37,13 @@ add_header_library(
     libc.include.llvm-libc-macros.float16_macros
     libc.include.llvm-libc-types.float128
 )
+
+add_header_library(
+  complex_types
+  HDRS
+    complex_types.h
+  DEPENDS
+    .types
+    libc.include.llvm-libc-types.cfloat16
+    libc.include.llvm-libc-types.cfloat128
+)
diff --git a/libc/src/__support/macros/properties/complex_types.h b/libc/src/__support/macros/properties/complex_types.h
new file mode 100644
index 000000000000..3f4a7646649c
--- /dev/null
+++ b/libc/src/__support/macros/properties/complex_types.h
@@ -0,0 +1,25 @@
+//===-- Complex Types support -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Complex Types detection and support.
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_CTYPES_H
+#define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_CTYPES_H
+
+#include "include/llvm-libc-types/cfloat128.h"
+#include "include/llvm-libc-types/cfloat16.h"
+#include "types.h"
+
+// -- cfloat16 support --------------------------------------------------------
+// LIBC_TYPES_HAS_CFLOAT16 and 'cfloat16' type is provided by
+// "include/llvm-libc-types/cfloat16.h"
+
+// -- cfloat128 support -------------------------------------------------------
+// LIBC_TYPES_HAS_CFLOAT128 and 'cfloat128' type are provided by
+// "include/llvm-libc-types/cfloat128.h"
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_CTYPES_H
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 5220b1245bf3..07e2cd5df18c 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -11,6 +11,7 @@
 
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/type_traits.h"
+#include "src/__support/CPP/type_traits/is_complex.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/fpbits_str.h"
@@ -128,6 +129,14 @@ public:
       return matchComplex<double>();
     else if (cpp::is_complex_type_same<T, _Complex long double>())
       return matchComplex<long double>();
+#ifdef LIBC_TYPES_HAS_CFLOAT16
+    else if (cpp::is_complex_type_same<T, cfloat16>())
+      return matchComplex<float16>();
+#endif
+#ifdef LIBC_TYPES_HAS_CFLOAT128
+    else if (cpp::is_complex_type_same<T, cfloat128>())
+      return matchComplex<float128>();
+#endif
   }
 
   void explainError() override {
@@ -137,6 +146,14 @@ public:
       return explainErrorComplex<double>();
     else if (cpp::is_complex_type_same<T, _Complex long double>())
       return explainErrorComplex<long double>();
+#ifdef LIBC_TYPES_HAS_CFLOAT16
+    else if (cpp::is_complex_type_same<T, cfloat16>())
+      return explainErrorComplex<float16>();
+#endif
+#ifdef LIBC_TYPES_HAS_CFLOAT128
+    else if (cpp::is_complex_type_same<T, cfloat128>())
+      return explainErrorComplex<float128>();
+#endif
   }
 };
-- 
GitLab


From eb446eb4f71f0d4da3840ad7d77af5da59838f38 Mon Sep 17 00:00:00 2001
From: Harrison Hao
Date: Fri, 18 Oct 2024 14:34:33 +0800
Subject: [PATCH 003/511] [MLIR][BUILD] Fix Unicode build issue on Windows.
 (#112300)

This issue stems from https://github.com/llvm/llvm-project/pull/77668,
which introduced Unicode marker strings. Building MLIR on Windows with
Visual Studio 2022 failed because of them.
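[Editor's illustration, not part of the original message] A minimal sketch of
the assumed failure mode: unless /utf-8 is passed, MSVC decodes narrow string
literals in the active system code page, so non-ASCII markers like the ones
this patch removes can be mangled or rejected (e.g. warning C4819):

    // Hypothetical repro, not taken from the tree:
    const char *openMarker = "《";  // non-ASCII literal; fragile under MSVC without /utf-8
    const char *asciiMarker = "<"; // portable ASCII marker of the kind this patch adopts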
---------

Co-authored-by: Harrison Hao
---
 mlir/test/mlir-rewrite/simple.mlir       |  3 +--
 mlir/tools/mlir-rewrite/mlir-rewrite.cpp | 16 ++++++++--------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/mlir/test/mlir-rewrite/simple.mlir b/mlir/test/mlir-rewrite/simple.mlir
index ab6bfe24fccf..66f17f093f50 100644
--- a/mlir/test/mlir-rewrite/simple.mlir
+++ b/mlir/test/mlir-rewrite/simple.mlir
@@ -4,8 +4,7 @@
 func.func @two_dynamic_one_direct_shape(%arg0: tensor<?x4x?xf32>,
                                         %arg1: tensor<2x4x?xf32>) -> tensor<?x4x?xf32> {
   // RENAME: "test.concat"({{.*}}) {bxis = 0 : i64}
-  // RANGE: 《%{{.*}} = 〖"test.concat"〗({{.*}}) {axis = 0 : i64} : (tensor<?x4x?xf32>, tensor<2x4x?xf32>) -> tensor<?x4x?xf32>》
+  // RANGE: <%{{.*}} = ["test.concat"]({{.*}}) {axis = 0 : i64} : (tensor<?x4x?xf32>, tensor<2x4x?xf32>) -> tensor<?x4x?xf32>>
   %5 = "test.concat"(%arg0, %arg1) {axis = 0 : i64} : (tensor<?x4x?xf32>, tensor<2x4x?xf32>) -> tensor<?x4x?xf32>
   return %5 : tensor<?x4x?xf32>
 }
-
diff --git a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
index 308e6490726c..e70aa5d41aa0 100644
--- a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
+++ b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
@@ -320,25 +320,25 @@ LogicalResult markRanges(RewritePad &rewriteState, raw_ostream &os) {
   for (auto it : rewriteState.getOpDefs()) {
     auto [startOp, endOp] = getOpRange(it);
 
-    rewriteState.insertText(startOp, "《");
-    rewriteState.insertText(endOp, "》");
+    rewriteState.insertText(startOp, "<");
+    rewriteState.insertText(endOp, ">");
 
     auto nameRange = getOpNameRange(it);
 
     if (isGeneric(it)) {
-      rewriteState.insertText(nameRange.Start, "〖");
-      rewriteState.insertText(nameRange.End, "〗");
+      rewriteState.insertText(nameRange.Start, "[");
+      rewriteState.insertText(nameRange.End, "]");
     } else {
-      rewriteState.insertText(nameRange.Start, "〔");
-      rewriteState.insertText(nameRange.End, "〕");
+      rewriteState.insertText(nameRange.Start, "![");
+      rewriteState.insertText(nameRange.End, "]!");
     }
   }
 
   // Highlight all comment lines.
   // TODO: Could be replaced if this is kept in memory.
   for (auto commentLine : rewriteState.getSingleLineComments()) {
-    rewriteState.insertText(commentLine.Start, "❰");
-    rewriteState.insertText(commentLine.End, "❱");
+    rewriteState.insertText(commentLine.Start, "{");
+    rewriteState.insertText(commentLine.End, "}");
   }
 
   return success();
-- 
GitLab


From e8509a43acb286181aa84f8035ece3b59562cd10 Mon Sep 17 00:00:00 2001
From: Jim Lin
Date: Fri, 18 Oct 2024 14:39:54 +0800
Subject: [PATCH 004/511] [RISCV] Check if v extension is enabled by the
 function features for the builtins not in Zve64*.
(#112827)

Fixes: https://github.com/llvm/llvm-project/issues/109694
---
 clang/lib/Sema/SemaRISCV.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp
index 3da4b515b1b1..d1ccc2774152 100644
--- a/clang/lib/Sema/SemaRISCV.cpp
+++ b/clang/lib/Sema/SemaRISCV.cpp
@@ -623,7 +623,12 @@ bool SemaRISCV::CheckBuiltinFunctionCall(const TargetInfo &TI,
 
   ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(
       TheCall->getType()->castAs<BuiltinType>());
-  if (Context.getTypeSize(Info.ElementType) == 64 && !TI.hasFeature("v"))
+  const FunctionDecl *FD = SemaRef.getCurFunctionDecl();
+  llvm::StringMap<bool> FunctionFeatureMap;
+  Context.getFunctionFeatureMap(FunctionFeatureMap, FD);
+
+  if (Context.getTypeSize(Info.ElementType) == 64 && !TI.hasFeature("v") &&
+      !FunctionFeatureMap.lookup("v"))
     return Diag(TheCall->getBeginLoc(),
                 diag::err_riscv_builtin_requires_extension)
            << /* IsExtension */ true << TheCall->getSourceRange() << "v";
-- 
GitLab


From 18b50189a749a39d1ac61a72af1d103f68fefc6b Mon Sep 17 00:00:00 2001
From: Julian Schmidt
Date: Fri, 18 Oct 2024 09:07:21 +0200
Subject: [PATCH 005/511] [clang-tidy] rewrite matchers in
 modernize-use-starts-ends-with (#112101)

Rewrite the AST matchers for slightly more composability. Furthermore,
check that the `starts_with` and `ends_with` functions return a `bool`.

There is one behavioral change: the methods of a class (and its
transitive base classes) are searched once for a matching
`starts_with`/`ends_with` function, picking the first one found.
Previously, the matchers would try to find `starts_with`, then
`startsWith`, and finally `startswith`. Now, the first of the three that
is encountered will be the matched method.

---------

Co-authored-by: Nicolas van Kempen
---
 .../modernize/UseStartsEndsWithCheck.cpp      | 82 +++++++++----------
 .../modernize/use-starts-ends-with.cpp        | 13 +--
 2 files changed, 39 insertions(+), 56 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp
index 5eb3267adb07..1231f954298a 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp
@@ -9,7 +9,8 @@
 #include "UseStartsEndsWithCheck.h"
 
 #include "../utils/ASTUtils.h"
-#include "../utils/OptionsUtils.h"
+#include "../utils/Matchers.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Lex/Lexer.h"
 
 #include <string>
@@ -82,60 +83,53 @@ UseStartsEndsWithCheck::UseStartsEndsWithCheck(StringRef Name,
 void UseStartsEndsWithCheck::registerMatchers(MatchFinder *Finder) {
   const auto ZeroLiteral = integerLiteral(equals(0));
 
-  const auto HasStartsWithMethodWithName = [](const std::string &Name) {
-    return hasMethod(
-        cxxMethodDecl(hasName(Name), isConst(), parameterCountIs(1))
-            .bind("starts_with_fun"));
+  const auto ClassTypeWithMethod = [](const StringRef MethodBoundName,
+                                      const auto...
Methods) { + return cxxRecordDecl(anyOf( + hasMethod(cxxMethodDecl(isConst(), parameterCountIs(1), + returns(booleanType()), hasAnyName(Methods)) + .bind(MethodBoundName))...)); }; - const auto HasStartsWithMethod = - anyOf(HasStartsWithMethodWithName("starts_with"), - HasStartsWithMethodWithName("startsWith"), - HasStartsWithMethodWithName("startswith")); + const auto OnClassWithStartsWithFunction = - on(hasType(hasCanonicalType(hasDeclaration(cxxRecordDecl( - anyOf(HasStartsWithMethod, - hasAnyBase(hasType(hasCanonicalType( - hasDeclaration(cxxRecordDecl(HasStartsWithMethod))))))))))); - - const auto HasEndsWithMethodWithName = [](const std::string &Name) { - return hasMethod( - cxxMethodDecl(hasName(Name), isConst(), parameterCountIs(1)) - .bind("ends_with_fun")); - }; - const auto HasEndsWithMethod = anyOf(HasEndsWithMethodWithName("ends_with"), - HasEndsWithMethodWithName("endsWith"), - HasEndsWithMethodWithName("endswith")); - const auto OnClassWithEndsWithFunction = - on(expr(hasType(hasCanonicalType(hasDeclaration(cxxRecordDecl( - anyOf(HasEndsWithMethod, - hasAnyBase(hasType(hasCanonicalType(hasDeclaration( - cxxRecordDecl(HasEndsWithMethod))))))))))) - .bind("haystack")); + ClassTypeWithMethod("starts_with_fun", "starts_with", "startsWith", + "startswith", "StartsWith"); + + const auto OnClassWithEndsWithFunction = ClassTypeWithMethod( + "ends_with_fun", "ends_with", "endsWith", "endswith", "EndsWith"); // Case 1: X.find(Y) [!=]= 0 -> starts_with. const auto FindExpr = cxxMemberCallExpr( anyOf(argumentCountIs(1), hasArgument(1, ZeroLiteral)), - callee(cxxMethodDecl(hasName("find")).bind("find_fun")), - OnClassWithStartsWithFunction, hasArgument(0, expr().bind("needle"))); + callee( + cxxMethodDecl(hasName("find"), ofClass(OnClassWithStartsWithFunction)) + .bind("find_fun")), + hasArgument(0, expr().bind("needle"))); // Case 2: X.rfind(Y, 0) [!=]= 0 -> starts_with. const auto RFindExpr = cxxMemberCallExpr( hasArgument(1, ZeroLiteral), - callee(cxxMethodDecl(hasName("rfind")).bind("find_fun")), - OnClassWithStartsWithFunction, hasArgument(0, expr().bind("needle"))); + callee(cxxMethodDecl(hasName("rfind"), + ofClass(OnClassWithStartsWithFunction)) + .bind("find_fun")), + hasArgument(0, expr().bind("needle"))); // Case 3: X.compare(0, LEN(Y), Y) [!=]= 0 -> starts_with. const auto CompareExpr = cxxMemberCallExpr( argumentCountIs(3), hasArgument(0, ZeroLiteral), - callee(cxxMethodDecl(hasName("compare")).bind("find_fun")), - OnClassWithStartsWithFunction, hasArgument(2, expr().bind("needle")), + callee(cxxMethodDecl(hasName("compare"), + ofClass(OnClassWithStartsWithFunction)) + .bind("find_fun")), + hasArgument(2, expr().bind("needle")), hasArgument(1, lengthExprForStringNode("needle"))); // Case 4: X.compare(LEN(X) - LEN(Y), LEN(Y), Y) [!=]= 0 -> ends_with. const auto CompareEndsWithExpr = cxxMemberCallExpr( argumentCountIs(3), - callee(cxxMethodDecl(hasName("compare")).bind("find_fun")), - OnClassWithEndsWithFunction, hasArgument(2, expr().bind("needle")), + callee(cxxMethodDecl(hasName("compare"), + ofClass(OnClassWithEndsWithFunction)) + .bind("find_fun")), + on(expr().bind("haystack")), hasArgument(2, expr().bind("needle")), hasArgument(1, lengthExprForStringNode("needle")), hasArgument(0, binaryOperator(hasOperatorName("-"), @@ -145,7 +139,7 @@ void UseStartsEndsWithCheck::registerMatchers(MatchFinder *Finder) { // All cases comparing to 0. 
Finder->addMatcher( binaryOperator( - hasAnyOperatorName("==", "!="), + matchers::isEqualityOperator(), hasOperands(cxxMemberCallExpr(anyOf(FindExpr, RFindExpr, CompareExpr, CompareEndsWithExpr)) .bind("find_expr"), @@ -156,7 +150,7 @@ void UseStartsEndsWithCheck::registerMatchers(MatchFinder *Finder) { // Case 5: X.rfind(Y) [!=]= LEN(X) - LEN(Y) -> ends_with. Finder->addMatcher( binaryOperator( - hasAnyOperatorName("==", "!="), + matchers::isEqualityOperator(), hasOperands( cxxMemberCallExpr( anyOf( @@ -166,8 +160,10 @@ void UseStartsEndsWithCheck::registerMatchers(MatchFinder *Finder) { 1, anyOf(declRefExpr(to(varDecl(hasName("npos")))), memberExpr(member(hasName("npos"))))))), - callee(cxxMethodDecl(hasName("rfind")).bind("find_fun")), - OnClassWithEndsWithFunction, + callee(cxxMethodDecl(hasName("rfind"), + ofClass(OnClassWithEndsWithFunction)) + .bind("find_fun")), + on(expr().bind("haystack")), hasArgument(0, expr().bind("needle"))) .bind("find_expr"), binaryOperator(hasOperatorName("-"), @@ -190,9 +186,8 @@ void UseStartsEndsWithCheck::check(const MatchFinder::MatchResult &Result) { const CXXMethodDecl *ReplacementFunction = StartsWithFunction ? StartsWithFunction : EndsWithFunction; - if (ComparisonExpr->getBeginLoc().isMacroID()) { + if (ComparisonExpr->getBeginLoc().isMacroID()) return; - } const bool Neg = ComparisonExpr->getOpcode() == BO_NE; @@ -220,9 +215,8 @@ void UseStartsEndsWithCheck::check(const MatchFinder::MatchResult &Result) { (ReplacementFunction->getName() + "(").str()); // Add possible negation '!'. - if (Neg) { + if (Neg) Diagnostic << FixItHint::CreateInsertion(FindExpr->getBeginLoc(), "!"); - } } } // namespace clang::tidy::modernize diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-starts-ends-with.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-starts-ends-with.cpp index 798af260a3b6..91477241e82e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-starts-ends-with.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-starts-ends-with.cpp @@ -32,14 +32,9 @@ struct prefer_underscore_version_flip { size_t find(const char *s, size_t pos = 0) const; }; -struct prefer_underscore_version_inherit : public string_like { - bool startsWith(const char *s) const; -}; - void test(std::string s, std::string_view sv, sub_string ss, sub_sub_string sss, string_like sl, string_like_camel slc, prefer_underscore_version puv, - prefer_underscore_version_flip puvf, - prefer_underscore_version_inherit puvi) { + prefer_underscore_version_flip puvf) { s.find("a") == 0; // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with instead of find() == 0 // CHECK-FIXES: s.starts_with("a"); @@ -153,12 +148,6 @@ void test(std::string s, std::string_view sv, sub_string ss, sub_sub_string sss, // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with // CHECK-FIXES: puvf.starts_with("a"); - // Here, the subclass has startsWith, the superclass has starts_with. - // We prefer the version from the subclass. 
-  puvi.find("a") == 0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use startsWith
-  // CHECK-FIXES: puvi.startsWith("a");
-
   s.compare(0, 1, "a") == 0;
   // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with instead of compare() == 0
   // CHECK-FIXES: s.starts_with("a");
-- 
GitLab


From 5a09ce9e038ed73ea60edf5463dd6509f7c4848f Mon Sep 17 00:00:00 2001
From: Sven van Haastregt
Date: Fri, 18 Oct 2024 09:10:05 +0200
Subject: [PATCH 006/511] [OpenCL] Replace a CreatePointerCast call; NFC
 (#112676)

With opaque pointers, the only purpose of the cast here is to cast
between address spaces, similar to the 4-argument case below.

---
 clang/lib/CodeGen/CGBuiltin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a048a566a092..28f28c70b5ae 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5636,10 +5636,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy, Int32Ty, Int32Ty};
       llvm::FunctionType *FTy = llvm::FunctionType::get(
           Int32Ty, llvm::ArrayRef(ArgTys), false);
-      Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
+      Value *ACast = Builder.CreateAddrSpaceCast(Arg1, I8PTy);
       return RValue::get(
           EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
-                          {Arg0, BCast, PacketSize, PacketAlign}));
+                          {Arg0, ACast, PacketSize, PacketAlign}));
     } else {
       assert(4 == E->getNumArgs() &&
              "Illegal number of parameters to pipe function");
-- 
GitLab


From ba1ee2bab7a4cdc0975686e5099461c0b12c5345 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Fri, 18 Oct 2024 09:23:24 +0200
Subject: [PATCH 007/511] [APInt] Enable APInt ctor assertion by default
 (#112670)

This enables the assertion introduced in
https://github.com/llvm/llvm-project/pull/106524, which checks that the
value passed to the APInt constructor is indeed a valid N-bit signed or
unsigned integer.

Places that previously violated the assertion were updated in advance,
e.g. in https://github.com/llvm/llvm-project/pull/80309.

It is possible to opt-out of the check and restore the previous behavior
by setting implicitTrunc=true.

---
 llvm/include/llvm/ADT/APInt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index 63a138527b32..953b2a27b715 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -109,7 +109,7 @@ public:
   /// \param implicitTrunc allow implicit truncation of non-zero/sign bits of
   ///        val beyond the range of numBits
   APInt(unsigned numBits, uint64_t val, bool isSigned = false,
-        bool implicitTrunc = true)
+        bool implicitTrunc = false)
       : BitWidth(numBits) {
     if (!implicitTrunc) {
       if (isSigned) {
-- 
GitLab


From e6a4346b5a105c2f28349270c3a82935c9a84d16 Mon Sep 17 00:00:00 2001
From: Scott Manley
Date: Fri, 18 Oct 2024 02:29:25 -0500
Subject: [PATCH 008/511] [flang] add getElementType() to fir::SequenceType
 and fir::VectorType (#112770)

getElementType() was missing from SequenceType and VectorType. Replace
the obvious places where getEleTy() was used for these two types and
update them to use the new name.
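[Editor's illustration, not part of the original message] A sketch of how the
new accessor reads at a call site; `arrTy` is a hypothetical lowered array
type, and by construction the result is identical to getEleTy():

    // Assuming arrTy holds a fir::SequenceType value:
    auto seqTy = mlir::cast<fir::SequenceType>(arrTy);
    mlir::Type eleTy = seqTy.getElementType(); // same as seqTy.getEleTy()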
Co-authored-by: Scott Manley
---
 .../Optimizer/Builder/PPCIntrinsicCall.h      |  2 +-
 .../flang/Optimizer/Dialect/FIRTypes.td       |  4 ++++
 flang/lib/Lower/ConvertConstant.cpp           |  7 +++---
 flang/lib/Lower/ConvertExpr.cpp               |  8 +++----
 flang/lib/Lower/ConvertExprToHLFIR.cpp        |  5 ++--
 flang/lib/Lower/ConvertVariable.cpp           |  2 +-
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp |  4 ++--
 .../Optimizer/Builder/PPCIntrinsicCall.cpp    |  2 +-
 .../Optimizer/Builder/Runtime/Reduction.cpp   | 24 +++++++++----------
 .../Builder/Runtime/Transformational.cpp      |  8 +++----
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       |  4 ++--
 flang/lib/Optimizer/Dialect/FIROps.cpp        |  6 ++---
 12 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
index a7c4c075d818..5ae32f70a11a 100644
--- a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
@@ -182,7 +182,7 @@ struct VecTypeInfo {
 static inline VecTypeInfo getVecTypeFromFirType(mlir::Type firTy) {
   assert(mlir::isa<fir::VectorType>(firTy));
   VecTypeInfo vecTyInfo;
-  vecTyInfo.eleTy = mlir::dyn_cast<fir::VectorType>(firTy).getEleTy();
+  vecTyInfo.eleTy = mlir::dyn_cast<fir::VectorType>(firTy).getElementType();
   vecTyInfo.len = mlir::dyn_cast<fir::VectorType>(firTy).getLen();
   return vecTyInfo;
 }
diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
index 7ac8e0822ecc..bfd00c345583 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
@@ -465,6 +465,8 @@ def fir_SequenceType : FIR_Type<"Sequence", "array"> {
         size = size * static_cast<std::uint64_t>(extent);
       return size;
     }
+
+    mlir::Type getElementType() const { return getEleTy(); }
   }];
 }
@@ -519,6 +521,8 @@ def fir_VectorType : FIR_Type<"Vector", "vector"> {
   let extraClassDeclaration = [{
     static bool isValidElementType(mlir::Type t);
+
+    mlir::Type getElementType() const { return getEleTy(); }
   }];
 
   let skipDefaultBuilders = 1;
diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp
index 748be508235f..556b330b967c 100644
--- a/flang/lib/Lower/ConvertConstant.cpp
+++ b/flang/lib/Lower/ConvertConstant.cpp
@@ -584,7 +584,8 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter,
     } while (con.IncrementSubscripts(subscripts));
   } else if constexpr (T::category == Fortran::common::TypeCategory::Derived) {
     do {
-      mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
+      mlir::Type eleTy =
+          mlir::cast<fir::SequenceType>(arrayTy).getElementType();
       mlir::Value elementVal =
           genScalarLit(converter, loc, con.At(subscripts), eleTy,
                        /*outlineInReadOnlyMemory=*/false);
@@ -594,7 +595,7 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter,
   } else {
     llvm::SmallVector<mlir::Value> rangeStartIdx;
    uint64_t rangeSize = 0;
-    mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
+    mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getElementType();
     do {
       auto getElementVal = [&]() {
         return builder.createConvert(loc, eleTy,
@@ -643,7 +644,7 @@ genOutlineArrayLit(Fortran::lower::AbstractConverter &converter,
                    mlir::Location loc, mlir::Type arrayTy,
                    const Fortran::evaluate::Constant<T> &constant) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
+  mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getElementType();
   llvm::StringRef globalName = converter.getUniqueLitName(
      loc, std::make_unique<Fortran::lower::SomeExpr>(toEvExpr(constant)),
      eleTy);
diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp
index 87e2114e4130..46168b81dd3a 100644
--- a/flang/lib/Lower/ConvertExpr.cpp
+++ b/flang/lib/Lower/ConvertExpr.cpp
@@ -1574,7 +1574,7 @@ public:
     mlir::Location loc = getLoc();
     mlir::Value addr = fir::getBase(array);
     mlir::Type arrTy = fir::dyn_cast_ptrEleTy(addr.getType());
-    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
     mlir::Type seqTy = builder.getRefType(builder.getVarLenSeqTy(eleTy));
     mlir::Type refTy = builder.getRefType(eleTy);
     mlir::Value base = builder.createConvert(loc, seqTy, addr);
@@ -1659,7 +1659,7 @@ public:
     mlir::Location loc = getLoc();
     mlir::Value addr = fir::getBase(exv);
     mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(addr.getType());
-    mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+    mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
     mlir::Type refTy = builder.getRefType(eleTy);
     mlir::IndexType idxTy = builder.getIndexType();
     llvm::SmallVector<mlir::Value> arrayCoorArgs;
@@ -4145,7 +4145,7 @@ private:
     mlir::Location loc = getLoc();
     return [=, builder = &converter.getFirOpBuilder()](IterSpace iters) {
       mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(tmp.getType());
-      auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+      auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
      mlir::Type eleRefTy = builder->getRefType(eleTy);
       mlir::IntegerType i1Ty = builder->getI1Type();
       // Adjust indices for any shift of the origin of the array.
@@ -5759,7 +5759,7 @@ private:
         return fir::BoxValue(embox, lbounds, nonDeferredLenParams);
       };
     }
-    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
     if (isReferentiallyOpaque()) {
       // Semantics are an opaque reference to an array.
      // This case forwards a continuation that will generate the address
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 93b78fd3357f..e93fbc562f9b 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -579,7 +579,8 @@ private:
       return createVectorSubscriptElementAddrOp(partInfo, baseType,
                                                 resultExtents);
-    mlir::Type resultType = mlir::cast<fir::SequenceType>(baseType).getEleTy();
+    mlir::Type resultType =
+        mlir::cast<fir::SequenceType>(baseType).getElementType();
     if (!resultTypeShape.empty()) {
       // Ranked array section. The result shape comes from the array section
       // subscripts.
@@ -811,7 +812,7 @@ private:
       }
     }
     builder.setInsertionPoint(elementalAddrOp);
-    return mlir::cast<fir::SequenceType>(baseType).getEleTy();
+    return mlir::cast<fir::SequenceType>(baseType).getElementType();
   }
 
   /// Yield the designator for the final part-ref inside the
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 8b03d60e47ca..cc51d5a9bb8d 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -518,7 +518,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
   // type does not support nested structures.
  if (mlir::isa<fir::SequenceType>(symTy) &&
      !Fortran::semantics::IsAllocatableOrPointer(sym)) {
-    mlir::Type eleTy = mlir::cast<fir::SequenceType>(symTy).getEleTy();
+    mlir::Type eleTy = mlir::cast<fir::SequenceType>(symTy).getElementType();
     if (mlir::isa<mlir::IntegerType, mlir::FloatType, mlir::ComplexType,
                   fir::LogicalType>(eleTy)) {
       const auto *details =
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index e6143275ce1d..462193a850c4 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -3824,7 +3824,7 @@ IntrinsicLibrary::genReduction(FN func, FD funcDim, llvm::StringRef errMsg,
   if (absentDim || rank == 1) {
     mlir::Type ty = array.getType();
     mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
     if (fir::isa_complex(eleTy)) {
       mlir::Value result = builder.createTemporary(loc, eleTy);
       func(builder, loc, array, mask, result);
@@ -6137,7 +6137,7 @@ IntrinsicLibrary::genReduce(mlir::Type resultType,
 
   mlir::Type ty = array.getType();
   mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
 
   // Handle optional arguments
   bool absentDim = isStaticallyAbsent(args[2]);
diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
index 7f09e8822844..b3b07d18a956 100644
--- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
@@ -2797,7 +2797,7 @@ void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef<fir::ExtendedValue> args) {
       if (vType != targetType) {
         if (mlir::isa<mlir::VectorType>(targetType)) {
           // Perform vector type conversion for arguments passed by value.
-          auto eleTy{mlir::dyn_cast<fir::VectorType>(vType).getEleTy()};
+          auto eleTy{mlir::dyn_cast<fir::VectorType>(vType).getElementType()};
           auto len{mlir::dyn_cast<fir::VectorType>(vType).getLen()};
           mlir::VectorType mlirType = mlir::VectorType::get(len, eleTy);
           auto v0{builder.createConvert(loc, mlirType, v)};
diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
index b39824428c78..f6627dff671e 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
@@ -1157,7 +1157,7 @@ void fir::runtime::genMaxloc(fir::FirOpBuilder &builder, mlir::Location loc,
                             mlir::Value back) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   fir::factory::CharacterExprHelper charHelper{builder, loc};
   auto [cat, kind] = fir::mlirTypeToCategoryKind(loc, eleTy);
   mlir::func::FuncOp func;
@@ -1189,7 +1189,7 @@ mlir::Value fir::runtime::genMaxval(fir::FirOpBuilder &builder,
                                    mlir::Value maskBox) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
   auto [cat, kind] = fir::mlirTypeToCategoryKind(loc, eleTy);
   mlir::func::FuncOp func;
@@ -1241,7 +1241,7 @@ void fir::runtime::genMinloc(fir::FirOpBuilder &builder, mlir::Location loc,
                             mlir::Value back) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto [cat, kind] = fir::mlirTypeToCategoryKind(loc, eleTy);
   mlir::func::FuncOp func;
   REAL_INTRINSIC_INSTANCES(Minloc, )
@@ -1298,7 +1298,7 @@ mlir::Value fir::runtime::genMinval(fir::FirOpBuilder &builder,
                                    mlir::Value maskBox) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
@@ -1326,7 +1326,7 @@ void fir::runtime::genNorm2Dim(fir::FirOpBuilder &builder, mlir::Location loc,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   if (eleTy.isF128())
     func = fir::runtime::getRuntimeFunc<ForcedNorm2DimReal16>(loc, builder);
   else
@@ -1348,7 +1348,7 @@ mlir::Value fir::runtime::genNorm2(fir::FirOpBuilder &builder,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
 
   if (eleTy.isF32())
@@ -1398,7 +1398,7 @@ mlir::Value fir::runtime::genProduct(fir::FirOpBuilder &builder,
                                     mlir::Value resultBox) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
   auto [cat, kind] = fir::mlirTypeToCategoryKind(loc, eleTy);
 
@@ -1482,7 +1482,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc,
                                 mlir::Value resultBox) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
   auto [cat, kind] = fir::mlirTypeToCategoryKind(loc, eleTy);
 
@@ -1521,7 +1521,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc,
   mlir::func::FuncOp func;                                                     \
   auto ty = arrayBox.getType();                                                \
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);                                \
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();                \
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();          \
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);    \
                                                                                \
   if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(1)))              \
@@ -1596,7 +1596,7 @@ void fir::runtime::genReduce(fir::FirOpBuilder &builder, mlir::Location loc,
                             bool argByRef) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto dim = builder.createIntegerConstant(loc, builder.getI32Type(), 1);
 
   assert(resultBox && "expect non null value for the result");
@@ -1646,7 +1646,7 @@ mlir::Value fir::runtime::genReduce(fir::FirOpBuilder &builder,
                                    bool argByRef) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto dim = builder.createIntegerConstant(loc, builder.getI32Type(), 1);
 
   assert((fir::isa_real(eleTy) || fir::isa_integer(eleTy) ||
@@ -1687,7 +1687,7 @@ void fir::runtime::genReduceDim(fir::FirOpBuilder &builder, mlir::Location loc,
                                mlir::Value resultBox, bool argByRef) {
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getElementType();
   auto [cat, kind] = fir::mlirTypeToCategoryKind(loc, eleTy);
   mlir::func::FuncOp func;
diff --git a/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp b/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp
index 8f08b01fe009..50f14abd01c1 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp
@@ -365,11 +365,11 @@ void fir::runtime::genMatmul(fir::FirOpBuilder &builder, mlir::Location loc,
   mlir::func::FuncOp func;
   auto boxATy = matrixABox.getType();
   auto arrATy = fir::dyn_cast_ptrOrBoxEleTy(boxATy);
-  auto arrAEleTy = mlir::cast<fir::SequenceType>(arrATy).getEleTy();
+  auto arrAEleTy = mlir::cast<fir::SequenceType>(arrATy).getElementType();
   auto [aCat, aKind] = fir::mlirTypeToCategoryKind(loc, arrAEleTy);
   auto boxBTy = matrixBBox.getType();
   auto arrBTy = fir::dyn_cast_ptrOrBoxEleTy(boxBTy);
-  auto arrBEleTy = mlir::cast<fir::SequenceType>(arrBTy).getEleTy();
+  auto arrBEleTy = mlir::cast<fir::SequenceType>(arrBTy).getElementType();
   auto [bCat, bKind] = fir::mlirTypeToCategoryKind(loc, arrBEleTy);
 
 #define MATMUL_INSTANCE(ACAT, AKIND, BCAT, BKIND)                              \
@@ -417,11 +417,11 @@ void fir::runtime::genMatmulTranspose(fir::FirOpBuilder &builder,
   mlir::func::FuncOp func;
   auto boxATy = matrixABox.getType();
   auto arrATy = fir::dyn_cast_ptrOrBoxEleTy(boxATy);
-  auto arrAEleTy = mlir::cast<fir::SequenceType>(arrATy).getEleTy();
+  auto arrAEleTy = mlir::cast<fir::SequenceType>(arrATy).getElementType();
   auto [aCat, aKind] = fir::mlirTypeToCategoryKind(loc, arrAEleTy);
   auto boxBTy = matrixBBox.getType();
   auto arrBTy = fir::dyn_cast_ptrOrBoxEleTy(boxBTy);
-  auto arrBEleTy = mlir::cast<fir::SequenceType>(arrBTy).getEleTy();
+  auto arrBEleTy = mlir::cast<fir::SequenceType>(arrBTy).getElementType();
   auto [bCat, bKind] = fir::mlirTypeToCategoryKind(loc, arrBEleTy);
 
 #define MATMUL_INSTANCE(ACAT, AKIND, BCAT, BKIND)                              \
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 68b8c6613585..e6eeb0d5db4a 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2619,7 +2619,7 @@ private:
           dims = dimsLeft - 1;
           continue;
         }
-        cpnTy = mlir::cast<fir::SequenceType>(cpnTy).getEleTy();
+        cpnTy = mlir::cast<fir::SequenceType>(cpnTy).getElementType();
         // append array range in reverse (FIR arrays are column-major)
         offs.append(arrIdx.rbegin(), arrIdx.rend());
         arrIdx.clear();
@@ -2633,7 +2633,7 @@ private:
           arrIdx.push_back(nxtOpnd);
           continue;
         }
-        cpnTy = mlir::cast<fir::SequenceType>(cpnTy).getEleTy();
+        cpnTy = mlir::cast<fir::SequenceType>(cpnTy).getElementType();
         offs.push_back(nxtOpnd);
         continue;
       }
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 90ce8b876059..cdcf9bda49a6 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -1359,7 +1359,7 @@ bool fir::ConvertOp::isPointerCompatible(mlir::Type ty) {
 static std::optional<mlir::Type> getVectorElementType(mlir::Type ty) {
   mlir::Type elemTy;
   if (mlir::isa<fir::VectorType>(ty))
-    elemTy = mlir::dyn_cast<fir::VectorType>(ty).getEleTy();
+    elemTy = mlir::dyn_cast<fir::VectorType>(ty).getElementType();
   else if (mlir::isa<mlir::VectorType>(ty))
     elemTy = mlir::dyn_cast<mlir::VectorType>(ty).getElementType();
   else
@@ -1533,7 +1533,7 @@ llvm::LogicalResult fir::CoordinateOp::verify() {
     }
     if (dimension) {
       if (--dimension == 0)
-        eleTy = mlir::cast<fir::SequenceType>(eleTy).getEleTy();
+        eleTy = mlir::cast<fir::SequenceType>(eleTy).getElementType();
     } else {
       if (auto t = mlir::dyn_cast<mlir::TupleType>(eleTy)) {
         // FIXME: Generally, we don't know which field of the tuple is being
@@ -3817,7 +3817,7 @@ void fir::StoreOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
 //===----------------------------------------------------------------------===//
 
 inline fir::CharacterType::KindTy stringLitOpGetKind(fir::StringLitOp op) {
-  auto eleTy = mlir::cast<fir::SequenceType>(op.getType()).getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(op.getType()).getElementType();
   return mlir::cast<fir::CharacterType>(eleTy).getFKind();
 }
-- 
GitLab


From 9698e57548c61d356f12cc42a8b4785e56f9ab51 Mon Sep 17 00:00:00 2001
From: Yusuke MINATO
Date: Fri, 18 Oct 2024 16:30:23 +0900
Subject: [PATCH 009/511] [flang][Driver] Add support for -f[no-]wrapv and
 -f[no-]strict-overflow in the frontend (#110061)

This patch introduces the options for integer overflow flags into Flang.
The behavior is similar to that of Clang.
---
 clang/include/clang/Driver/Options.td         | 11 +++---
 clang/lib/Driver/ToolChains/Clang.cpp         | 13 ++-----
 clang/lib/Driver/ToolChains/CommonArgs.cpp    | 14 ++++++++
 clang/lib/Driver/ToolChains/CommonArgs.h      |  3 ++
 clang/lib/Driver/ToolChains/Flang.cpp         |  2 ++
 flang/include/flang/Common/LangOptions.def    |  2 ++
 flang/include/flang/Common/LangOptions.h      |  8 +++++
 flang/include/flang/Lower/LoweringOptions.def |  5 ++-
 flang/lib/Frontend/CompilerInvocation.cpp     | 36 +++++++++++++++++--
 flang/test/Driver/frontend-forwarding.f90     |  2 ++
 flang/test/Driver/integer-overflow.f90        | 10 ++++++
 11 files changed, 86 insertions(+), 20 deletions(-)
 create mode 100644 flang/test/Driver/integer-overflow.f90

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4eb013d587eb..152c43d7908f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3454,7 +3454,8 @@ def fno_strict_aliasing : Flag<["-"], "fno-strict-aliasing">, Group<f_Group>,
 def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group<f_Group>;
 def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group<f_Group>;
 def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>;
-def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>;
+def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>,
+  Visibility<[ClangOption, FlangOption]>;
 def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group<f_Group>;
 def fno_temp_file : Flag<["-"], "fno-temp-file">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText<
@@ -3470,7 +3471,8 @@ def fno_verbose_asm : Flag<["-"], "fno-verbose-asm">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option]>,
   MarshallingInfoNegativeFlag<CodeGenOpts<"AsmVerbose">>;
 def fno_working_directory : Flag<["-"], "fno-working-directory">, Group<f_Group>;
-def fno_wrapv : Flag<["-"], "fno-wrapv">, Group<f_Group>;
+def fno_wrapv : Flag<["-"], "fno-wrapv">, Group<f_Group>,
+  Visibility<[ClangOption, FlangOption]>;
 def fobjc_arc : Flag<["-"], "fobjc-arc">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option]>,
   HelpText<"Synthesize retain and release calls for Objective-C pointers">;
@@ -3966,7 +3968,8 @@ defm strict_vtable_pointers : BoolFOption<"strict-vtable-pointers",
           "Enable optimizations based on the strict rules for"
           " overwriting polymorphic C++ objects">,
   NegFlag<SetFalse>>;
-def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group<f_Group>;
+def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group<f_Group>,
+  Visibility<[ClangOption, FlangOption]>;
 def fpointer_tbaa : Flag<["-"], "fpointer-tbaa">, Group<f_Group>;
 def fdriver_only : Flag<["-"], "fdriver-only">, Flags<[NoXarchOption]>,
   Visibility<[ClangOption, CLOption, DXCOption]>,
@@ -4235,7 +4238,7 @@ defm virtual_function_elimination : BoolFOption<"virtual-function-elimination",
   NegFlag<SetFalse, [], [ClangOption, CC1Option]>,
   BothFlags<[], [ClangOption, CLOption]>>;
 def fwrapv : Flag<["-"], "fwrapv">, Group<f_Group>,
-  Visibility<[ClangOption, CC1Option]>,
+  Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
   HelpText<"Treat signed integer overflow as two's complement">;
 def fwritable_strings : Flag<["-"], "fwritable-strings">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option]>,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 192eb608de43..d032fd7a59f3 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6924,16 +6924,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
   Args.AddLastArg(CmdArgs, options::OPT_ftrap_function_EQ);
 
-  // -fno-strict-overflow implies -fwrapv if it isn't disabled, but
-  // -fstrict-overflow won't turn off an explicitly enabled -fwrapv.
-  if (Arg *A = Args.getLastArg(options::OPT_fwrapv, options::OPT_fno_wrapv)) {
-    if (A->getOption().matches(options::OPT_fwrapv))
-      CmdArgs.push_back("-fwrapv");
-  } else if (Arg *A = Args.getLastArg(options::OPT_fstrict_overflow,
-                                      options::OPT_fno_strict_overflow)) {
-    if (A->getOption().matches(options::OPT_fno_strict_overflow))
-      CmdArgs.push_back("-fwrapv");
-  }
+  // Handle -f[no-]wrapv and -f[no-]strict-overflow, which are used by both
+  // clang and flang.
+  renderCommonIntegerOverflowOptions(Args, CmdArgs);
 
   Args.AddLastArg(CmdArgs, options::OPT_ffinite_loops,
                   options::OPT_fno_finite_loops);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index e662c3f0d2fa..91605a67a37f 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -3048,3 +3048,17 @@ bool tools::shouldRecordCommandLine(const ToolChain &TC,
 
   return FRecordCommandLine || TC.UseDwarfDebugFlags() || GRecordCommandLine;
 }
+
+void tools::renderCommonIntegerOverflowOptions(const ArgList &Args,
+                                               ArgStringList &CmdArgs) {
+  // -fno-strict-overflow implies -fwrapv if it isn't disabled, but
+  // -fstrict-overflow won't turn off an explicitly enabled -fwrapv.
+ if (Arg *A = Args.getLastArg(options::OPT_fwrapv, options::OPT_fno_wrapv)) { + if (A->getOption().matches(options::OPT_fwrapv)) + CmdArgs.push_back("-fwrapv"); + } else if (Arg *A = Args.getLastArg(options::OPT_fstrict_overflow, + options::OPT_fno_strict_overflow)) { + if (A->getOption().matches(options::OPT_fno_strict_overflow)) + CmdArgs.push_back("-fwrapv"); + } +} diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 9cafac253886..b6ddd99b8727 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -262,6 +262,9 @@ bool shouldRecordCommandLine(const ToolChain &TC, bool &FRecordCommandLine, bool &GRecordCommandLine); +void renderCommonIntegerOverflowOptions(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs); + } // end namespace tools } // end namespace driver } // end namespace clang diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index e2f8f6e0cca1..a9d2b7a4dc48 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -869,6 +869,8 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, } } + renderCommonIntegerOverflowOptions(Args, CmdArgs); + assert((Output.isFilename() || Output.isNothing()) && "Invalid output."); if (Output.isFilename()) { CmdArgs.push_back("-o"); diff --git a/flang/include/flang/Common/LangOptions.def b/flang/include/flang/Common/LangOptions.def index d3e1e972d151..1bfdba9cc2c1 100644 --- a/flang/include/flang/Common/LangOptions.def +++ b/flang/include/flang/Common/LangOptions.def @@ -20,6 +20,8 @@ LANGOPT(Name, Bits, Default) #endif ENUM_LANGOPT(FPContractMode, FPModeKind, 2, FPM_Fast) ///< FP Contract Mode (off/fast) +/// signed integer overflow handling +ENUM_LANGOPT(SignedOverflowBehavior, SignedOverflowBehaviorTy, 1, SOB_Undefined) /// Indicate a build without the standard GPU libraries. LANGOPT(NoGPULib , 1, false) diff --git a/flang/include/flang/Common/LangOptions.h b/flang/include/flang/Common/LangOptions.h index 52a45047deb0..83f25cfbe261 100644 --- a/flang/include/flang/Common/LangOptions.h +++ b/flang/include/flang/Common/LangOptions.h @@ -27,6 +27,14 @@ namespace Fortran::common { class LangOptionsBase { public: + enum SignedOverflowBehaviorTy { + // -fno-wrapv (default behavior in Flang) + SOB_Undefined, + + // -fwrapv + SOB_Defined, + }; + enum FPModeKind { // Do not fuse FP ops FPM_Off, diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index d3f17c3f939c..231de533fbd3 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -35,9 +35,8 @@ ENUM_LOWERINGOPT(NoPPCNativeVecElemOrder, unsigned, 1, 0) ENUM_LOWERINGOPT(Underscoring, unsigned, 1, 1) /// If true, assume the behavior of integer overflow is defined -/// (i.e. wraps around as two's complement). On by default. -/// TODO: make the default off -ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 1) +/// (i.e. wraps around as two's complement). Off by default. +ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 0) /// If true, add nsw flags to loop variable increments. /// Off by default. 
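A quick illustration of the flag interaction implemented by renderCommonIntegerOverflowOptions above (a sketch of the intended driver behavior, not part of the patch itself; the authoritative expectations are encoded in flang/test/Driver/integer-overflow.f90, added further down in this patch):

    flang -fwrapv t.f90               ->  fc1 receives "-fwrapv"
    flang -fno-strict-overflow t.f90  ->  fc1 receives "-fwrapv" (implied)
    flang -fstrict-overflow t.f90     ->  nothing forwarded; overflow stays undefined
    flang -fwrapv -fno-wrapv t.f90    ->  nothing forwarded; the last of the pair wins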
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 4607a33ffda6..94d3d1154178 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1115,6 +1115,24 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   return diags.getNumErrors() == numErrorsBefore;
 }
 
+/// Parses signed integer overflow options and populates the
+/// CompilerInvocation accordingly.
+/// Returns false if new errors are generated.
+///
+/// \param [out] invoc Stores the processed arguments
+/// \param [in] args The compiler invocation arguments to parse
+/// \param [out] diags DiagnosticsEngine to report errors with
+static bool parseIntegerOverflowArgs(CompilerInvocation &invoc,
+                                     llvm::opt::ArgList &args,
+                                     clang::DiagnosticsEngine &diags) {
+  Fortran::common::LangOptions &opts = invoc.getLangOpts();
+
+  if (args.getLastArg(clang::driver::options::OPT_fwrapv))
+    opts.setSignedOverflowBehavior(Fortran::common::LangOptions::SOB_Defined);
+
+  return true;
+}
+
 /// Parses all floating point related arguments and populates the
 /// CompilerInvocation accordingly.
 /// Returns false if new errors are generated.
@@ -1255,6 +1273,18 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc,
   return true;
 }
 
+static bool parseLangOptionsArgs(CompilerInvocation &invoc,
+                                 llvm::opt::ArgList &args,
+                                 clang::DiagnosticsEngine &diags) {
+  bool success = true;
+
+  success &= parseIntegerOverflowArgs(invoc, args, diags);
+  success &= parseFloatingPointArgs(invoc, args, diags);
+  success &= parseVScaleArgs(invoc, args, diags);
+
+  return success;
+}
+
 bool CompilerInvocation::createFromArgs(
     CompilerInvocation &invoc, llvm::ArrayRef<const char *> commandLineArgs,
     clang::DiagnosticsEngine &diags, const char *argv0) {
@@ -1363,9 +1393,7 @@ bool CompilerInvocation::createFromArgs(
   invoc.frontendOpts.mlirArgs =
       args.getAllArgValues(clang::driver::options::OPT_mmlir);
 
-  success &= parseFloatingPointArgs(invoc, args, diags);
-
-  success &= parseVScaleArgs(invoc, args, diags);
+  success &= parseLangOptionsArgs(invoc, args, diags);
 
   success &= parseLinkerOptionsArgs(invoc, args, diags);
 
@@ -1577,6 +1605,8 @@ void CompilerInvocation::setLoweringOptions() {
   loweringOpts.setUnderscoring(codegenOpts.Underscoring);
 
   const Fortran::common::LangOptions &langOptions = getLangOpts();
+  loweringOpts.setIntegerWrapAround(langOptions.getSignedOverflowBehavior() ==
+                                    Fortran::common::LangOptions::SOB_Defined);
   Fortran::common::MathOptionsBase &mathOpts = loweringOpts.getMathOptions();
   // TODO: when LangOptions are finalized, we can represent
   // the math related options using Fortran::commmon::MathOptionsBase,
diff --git a/flang/test/Driver/frontend-forwarding.f90 b/flang/test/Driver/frontend-forwarding.f90
index 0a56a1e3710d..ff2d66095214 100644
--- a/flang/test/Driver/frontend-forwarding.f90
+++ b/flang/test/Driver/frontend-forwarding.f90
@@ -14,6 +14,7 @@
 ! RUN:     -fno-signed-zeros \
 ! RUN:     -fassociative-math \
 ! RUN:     -freciprocal-math \
+! RUN:     -fno-strict-overflow \
 ! RUN:     -fomit-frame-pointer \
 ! RUN:     -fpass-plugin=Bye%pluginext \
 ! RUN:     -fversion-loops-for-stride \
@@ -63,4 +64,5 @@
 ! CHECK: "-Rpass=inline"
 ! CHECK: "-mframe-pointer=none"
 ! CHECK: "-mllvm" "-print-before-all"
+! CHECK: "-fwrapv"
 ! CHECK: "-save-temps=obj"
diff --git a/flang/test/Driver/integer-overflow.f90 b/flang/test/Driver/integer-overflow.f90
new file mode 100644
index 000000000000..023f39fa5413
--- /dev/null
+++ b/flang/test/Driver/integer-overflow.f90
@@ -0,0 +1,10 @@
+! Test for correct forwarding of integer overflow flags from the compiler driver
+! to the frontend driver
+
+! RUN: %flang -### -fno-strict-overflow %s 2>&1 | FileCheck %s --check-prefix=INDUCED
+! RUN: %flang -### -fstrict-overflow %s 2>&1 | FileCheck %s
+! RUN: %flang -### -fno-wrapv %s 2>&1 | FileCheck %s
+! RUN: %flang -### -fno-wrapv -fno-strict-overflow %s 2>&1 | FileCheck %s
+
+! CHECK-NOT: "-fno-wrapv"
+! INDUCED: "-fwrapv"
--
GitLab


From 9d0616ce52fc2a75c8e4808adec41d5189f4240c Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Fri, 18 Oct 2024 09:45:02 +0200
Subject: [PATCH 010/511] [clang][bytecode] Ignore explicit calls to trivial
 dtors (#112841)

This is what the current interpreter does as well.
---
 clang/lib/AST/ByteCode/Compiler.cpp       |  4 ++++
 clang/test/AST/ByteCode/placement-new.cpp | 13 +++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 8ca63bf64aa0..a71c0dcc9381 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -4535,6 +4535,10 @@ bool Compiler<Emitter>::VisitCallExpr(const CallExpr *E) {
       return VisitBuiltinCallExpr(E, Builtin::BI__builtin_operator_delete);
     }
   }
+  // Explicit calls to trivial destructors
+  if (const auto *DD = dyn_cast_if_present<CXXDestructorDecl>(FuncDecl);
+      DD && DD->isTrivial())
+    return true;
 
   QualType ReturnType = E->getCallReturnType(Ctx.getASTContext());
   std::optional<PrimType> T = classify(ReturnType);
diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp
index 8e6d802e9329..5673b5cba3f7 100644
--- a/clang/test/AST/ByteCode/placement-new.cpp
+++ b/clang/test/AST/ByteCode/placement-new.cpp
@@ -311,3 +311,16 @@ constexpr bool change_union_member() {
   return u.b == 2;
 }
 static_assert(change_union_member());
+
+namespace PR48606 {
+  struct A { mutable int n = 0; };
+
+  constexpr bool f() {
+    A a;
+    A *p = &a;
+    p->~A();
+    std::construct_at(p);
+    return true;
+  }
+  static_assert(f());
+}
--
GitLab


From f7f51f2afb638368ce895c01b4d9ba0eda988604 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Fri, 18 Oct 2024 08:58:58 +0100
Subject: [PATCH 011/511] [mlir][vector] Clarify the semantics of masking maps
 (nfc) (#111383)

We use the term "masking map" throughout the Linalg vectorization logic,
but we don't really define what it is and how it differs from Linalg
indexing maps. This PR clarifies the differences, makes sure that the new
terminology is used consistently, and improves code re-use.
---
 .../Linalg/Transforms/Vectorization.cpp       | 44 +++++++++++++------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 635273bcbc02..d39c5fcdbc42 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -250,6 +250,32 @@ private:
                          LinalgOp linalgOp,
                          std::optional<AffineMap> maybeMaskingMap);
 
+  /// Check whether this permutation map can be used for masking. At the
+  /// moment we only make sure that there are no broadcast dimensions, but this
+  /// might change if indexing maps evolve.
+ bool isValidMaskingMap(AffineMap maskingMap) { + return maskingMap.getBroadcastDims().size() == 0; + } + + /// Turn the input indexing map into a valid masking map. + /// + /// The input indexing map may contain "zero" results, e.g.: + /// (d0, d1, d2, d3) -> (d2, d1, d0, 0) + /// Applying such maps to canonical vector shapes like this one: + /// (1, 16, 16, 4) + /// would yield an invalid vector shape like this: + /// (16, 16, 1, 0) + /// Instead, drop the broadcasting dims that make no sense for masking perm. + /// maps: + /// (d0, d1, d2, d3) -> (d2, d1, d0) + /// This way, the corresponding vector/mask type will be: + /// vector<16x16x1xty> + /// rather than this invalid Vector type: + /// vector<16x16x1x0xty> + AffineMap getMaskingMapFromIndexingMap(AffineMap &indexingMap) { + return indexingMap.dropZeroResults(); + } + // Holds the compile-time static sizes of the iteration space to vectorize. // Dynamic dimensions are represented using ShapedType::kDynamic. SmallVector iterSpaceStaticSizes; @@ -360,6 +386,10 @@ VectorizationState::initState(RewriterBase &rewriter, LinalgOp linalgOp, Value VectorizationState::getOrCreateMaskFor( RewriterBase &rewriter, Operation *opToMask, LinalgOp linalgOp, std::optional maybeMaskingMap) { + + assert((!maybeMaskingMap || isValidMaskingMap(*maybeMaskingMap)) && + "Ill-formed masking map."); + // No mask is needed if the operation is not maskable. auto maskableOp = dyn_cast(opToMask); if (!maskableOp) @@ -429,20 +459,8 @@ VectorizationState::maskOperation(RewriterBase &rewriter, Operation *opToMask, LDBG("Trying to mask: " << *opToMask << "\n"); std::optional maybeMaskingMap = std::nullopt; - // The Operand indexing map may contain "zero" results, e.g.: - // (d0, d1, d2, d3) -> (d0, d1, d2, 0) - // When applied to canonical vector shapes like these: - // (1, 16, 16, 4) - // we would get: - // (1, 16, 16, 0) - // Instead, we should extract the following map permutation map for masking: - // (d0, d1, d2, d3) -> (d0, d1, d2) - // This way, the corresponding vector/mask type will be: - // vector<1x16x16xty> - // rather than: - // vector<1x16x16x0xty> if (maybeIndexingMap) - maybeMaskingMap = maybeIndexingMap->dropZeroResults(); + maybeMaskingMap = getMaskingMapFromIndexingMap(*maybeIndexingMap); // Create or retrieve mask for this operation. Value mask = -- GitLab From 18ac0178ad7f839bdb3376b7a0c6b69a24520a06 Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Fri, 18 Oct 2024 09:03:27 +0100 Subject: [PATCH 012/511] Revert "[ARM] Fix frame chains with M-profile PACBTI (#110285)" Reverting because this is causing failures with MSan: https://lab.llvm.org/buildbot/#/builders/169/builds/4378 This reverts commit e1f8f84acec05997893c305c78fbf7feecf44dd7. 
--- llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 5 +- llvm/lib/Target/ARM/ARMCallingConv.td | 19 +-- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 145 ++++++----------- llvm/lib/Target/ARM/ARMSubtarget.cpp | 7 - llvm/lib/Target/ARM/ARMSubtarget.h | 12 -- .../CodeGen/Thumb2/pacbti-m-frame-chain.ll | 150 ------------------ 6 files changed, 57 insertions(+), 281 deletions(-) delete mode 100644 llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index aad305cce039..3f28ce8ca4b5 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -116,12 +116,9 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_iOS_SaveList; if (PushPopSplit == ARMSubtarget::SplitR7) - return STI.createAAPCSFrameChain() ? CSR_AAPCS_SplitPush_R7_SaveList + return STI.createAAPCSFrameChain() ? CSR_AAPCS_SplitPush_SaveList : CSR_ATPCS_SplitPush_SaveList; - if (PushPopSplit == ARMSubtarget::SplitR11AAPCSSignRA) - return CSR_AAPCS_SplitPush_R11_SaveList; - return CSR_AAPCS_SaveList; } diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td index 27f175a70033..d14424c2deca 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/llvm/lib/Target/ARM/ARMCallingConv.td @@ -301,17 +301,14 @@ def CSR_ATPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush, def CSR_ATPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush, R10)>; -// Sometimes we need to split the push of the callee-saved GPRs into two -// regions, to ensure that the frame chain record is set up correctly. These -// list the callee-saved registers in the order they end up on the stack, which -// depends on whether the frame pointer is r7 or r11. -def CSR_AAPCS_SplitPush_R11 : CalleeSavedRegs<(add R10, R9, R8, R7, R6, R5, R4, - LR, R11, - (sequence "D%u", 15, 8))>; -def CSR_AAPCS_SplitPush_R7 : CalleeSavedRegs<(add LR, R11, - R7, R6, R5, R4, - R10, R9, R8, - (sequence "D%u", 15, 8))>; +// When enforcing an AAPCS compliant frame chain, R11 is used as the frame +// pointer even for Thumb targets, where split pushes are necessary. +// This AAPCS alternative makes sure the frame index slots match the push +// order in that case. +def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R11, + R7, R6, R5, R4, + R10, R9, R8, + (sequence "D%u", 15, 8))>; // Constructors and destructors return 'this' in the ARM C++ ABI; since 'this' // and the pointer return value are both passed in R0 in these cases, this can diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 82b6f808688e..e0703457aa81 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -199,11 +199,6 @@ SpillArea getSpillArea(Register Reg, // push {r0-r10, r12} GPRCS1 // vpush {r8-d15} DPRCS1 // push {r11, lr} GPRCS2 - // - // SplitR11AAPCSSignRA: - // push {r0-r10, r12} GPRSC1 - // push {r11, lr} GPRCS2 - // vpush {r8-d15} DPRCS1 // If FPCXTNS is spilled (for CMSE secure entryfunctions), it is always at // the top of the stack frame. 
@@ -251,8 +246,7 @@ SpillArea getSpillArea(Register Reg, return SpillArea::GPRCS1; case ARM::LR: - if (Variation == ARMSubtarget::SplitR11WindowsSEH || - Variation == ARMSubtarget::SplitR11AAPCSSignRA) + if (Variation == ARMSubtarget::SplitR11WindowsSEH) return SpillArea::GPRCS2; else return SpillArea::GPRCS1; @@ -869,9 +863,6 @@ static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI, // This is a conservative estimation: Assume the frame pointer being r7 and // pc("r15") up to r8 getting spilled before (= 8 registers). int MaxRegBytes = 8 * 4; - if (PushPopSplit == ARMSubtarget::SplitR11AAPCSSignRA) - // Here, r11 can be stored below all of r4-r15. - MaxRegBytes = 11 * 4; if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { // Here, r11 can be stored below all of r4-r15 plus d8-d15. MaxRegBytes = 11 * 4 + 8 * 8; @@ -944,23 +935,17 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } // Determine spill area sizes, and some important frame indices. - SpillArea FramePtrSpillArea; - bool BeforeFPPush = true; for (const CalleeSavedInfo &I : CSI) { Register Reg = I.getReg(); int FI = I.getFrameIdx(); - SpillArea Area = getSpillArea(Reg, PushPopSplit, - AFI->getNumAlignedDPRCS2Regs(), RegInfo); - - if (Reg == FramePtr) { + if (Reg == FramePtr) FramePtrSpillFI = FI; - FramePtrSpillArea = Area; - } if (Reg == ARM::D8) D8SpillFI = FI; - switch (Area) { + switch (getSpillArea(Reg, PushPopSplit, AFI->getNumAlignedDPRCS2Regs(), + RegInfo)) { case SpillArea::FPCXT: FPCXTSaveSize += 4; break; @@ -987,7 +972,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Move past FPCXT area. if (FPCXTSaveSize > 0) { LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, BeforeFPPush); + DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true); } // Allocate the vararg register save area. @@ -995,15 +980,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, MachineInstr::FrameSetup); LastPush = std::prev(MBBI); - DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, BeforeFPPush); + DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, true); } // Move past area 1. if (GPRCS1Size > 0) { GPRCS1Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, BeforeFPPush); - if (FramePtrSpillArea == SpillArea::GPRCS1) - BeforeFPPush = false; + DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true); } // Determine starting offsets of spill areas. These offsets are all positive @@ -1027,6 +1010,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } else { DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; } + int FramePtrOffsetInPush = 0; if (HasFP) { // Offset from the CFA to the saved frame pointer, will be negative. [[maybe_unused]] int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); @@ -1034,6 +1018,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, << ", FPOffset: " << FPOffset << "\n"); assert(getMaxFPOffset(STI, *AFI, MF) <= FPOffset && "Max FP estimation is wrong"); + // Offset from the top of the GPRCS1 area to the saved frame pointer, will + // be negative. 
+ FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize + FPCXTSaveSize; + LLVM_DEBUG(dbgs() << "FramePtrOffsetInPush=" << FramePtrOffsetInPush + << ", FramePtrSpillOffset=" + << (MFI.getObjectOffset(FramePtrSpillFI) + NumBytes) + << "\n"); AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) + NumBytes); } @@ -1045,9 +1036,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // after DPRCS1. if (GPRCS2Size > 0 && PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) { GPRCS2Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush); - if (FramePtrSpillArea == SpillArea::GPRCS2) - BeforeFPPush = false; + DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); } // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our @@ -1060,7 +1049,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, else { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize, MachineInstr::FrameSetup); - DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize, BeforeFPPush); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize); } } @@ -1069,8 +1058,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Since vpush register list cannot have gaps, there may be multiple vpush // instructions in the prologue. while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VSTMDDB_UPD) { - DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI), - BeforeFPPush); + DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI)); LastPush = MBBI++; } } @@ -1089,9 +1077,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Move GPRCS2, if using using SplitR11WindowsSEH. if (GPRCS2Size > 0 && PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { GPRCS2Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush); - if (FramePtrSpillArea == SpillArea::GPRCS2) - BeforeFPPush = false; + DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); } bool NeedsWinCFIStackAlloc = NeedsWinCFI; @@ -1192,51 +1178,28 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // into spill area 1, including the FP in R11. In either case, it // is in area one and the adjustment needs to take place just after // that push. + // FIXME: The above is not necessary true when PACBTI is enabled. + // AAPCS requires use of R11, and PACBTI gets in the way of regular pushes, + // so FP ends up on area two. MachineBasicBlock::iterator AfterPush; if (HasFP) { - MachineBasicBlock::iterator FPPushInst; - // Offset from SP immediately after the push which saved the FP to the FP - // save slot. 
- int64_t FPOffsetAfterPush; - switch (FramePtrSpillArea) { - case SpillArea::GPRCS1: - FPPushInst = GPRCS1Push; - FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) + - ArgRegsSaveSize + FPCXTSaveSize + - sizeOfSPAdjustment(*FPPushInst); - LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS1, offset " - << FPOffsetAfterPush << " after that push\n"); - break; - case SpillArea::GPRCS2: - FPPushInst = GPRCS2Push; - FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) + - ArgRegsSaveSize + FPCXTSaveSize + GPRCS1Size + - sizeOfSPAdjustment(*FPPushInst); - if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) - FPOffsetAfterPush += DPRCSSize + DPRGapSize; - LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS2, offset " - << FPOffsetAfterPush << " after that push\n"); - break; - default: - llvm_unreachable("frame pointer in unknown spill area"); - break; + AfterPush = std::next(GPRCS1Push); + unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push); + int FPOffset = PushSize + FramePtrOffsetInPush; + if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { + AfterPush = std::next(GPRCS2Push); + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, + FramePtr, ARM::SP, 0, MachineInstr::FrameSetup); + } else { + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, + FramePtr, ARM::SP, FPOffset, + MachineInstr::FrameSetup); } - AfterPush = std::next(FPPushInst); - if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) - assert(FPOffsetAfterPush == 0); - - // Emit the MOV or ADD to set up the frame pointer register. - emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, - FramePtr, ARM::SP, FPOffsetAfterPush, - MachineInstr::FrameSetup); - if (!NeedsWinCFI) { - // Emit DWARF info to find the CFA using the frame pointer from this - // point onward. - if (FPOffsetAfterPush != 0) { + if (FramePtrOffsetInPush + PushSize != 0) { unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( nullptr, MRI->getDwarfRegNum(FramePtr, true), - -MFI.getObjectOffset(FramePtrSpillFI))); + FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush)); BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1749,8 +1712,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && !isCmseEntry && !isTrap && AFI->getArgumentStackToRestore() == 0 && STI.hasV5TOps() && MBB.succ_empty() && !hasPAC && - (PushPopSplit != ARMSubtarget::SplitR11WindowsSEH && - PushPopSplit != ARMSubtarget::SplitR11AAPCSSignRA)) { + PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) { Reg = ARM::PC; // Fold the return instruction into the LDM. DeleteRet = true; @@ -2983,29 +2945,18 @@ bool ARMFrameLowering::assignCalleeSavedSpillSlots( const auto &AFI = *MF.getInfo(); if (AFI.shouldSignReturnAddress()) { // The order of register must match the order we push them, because the - // PEI assigns frame indices in that order. 
That order depends on the - // PushPopSplitVariation, there are only two cases which we use with return - // address signing: - switch (STI.getPushPopSplitVariation(MF)) { - case ARMSubtarget::SplitR7: - // LR, R7, R6, R5, R4, , R11, R10, R9, R8, D15-D8 - CSI.insert(find_if(CSI, - [=](const auto &CS) { - Register Reg = CS.getReg(); - return Reg == ARM::R10 || Reg == ARM::R11 || - Reg == ARM::R8 || Reg == ARM::R9 || - ARM::DPRRegClass.contains(Reg); - }), - CalleeSavedInfo(ARM::R12)); - break; - case ARMSubtarget::SplitR11AAPCSSignRA: - // With SplitR11AAPCSSignRA, R12 will always be the highest-addressed CSR - // on the stack. - CSI.insert(CSI.begin(), CalleeSavedInfo(ARM::R12)); - break; - default: - llvm_unreachable("Unexpected CSR split with return address signing"); - } + // PEI assigns frame indices in that order. When compiling for return + // address sign and authenication, we use split push, therefore the orders + // we want are: + // LR, R7, R6, R5, R4, , R11, R10, R9, R8, D15-D8 + CSI.insert(find_if(CSI, + [=](const auto &CS) { + Register Reg = CS.getReg(); + return Reg == ARM::R10 || Reg == ARM::R11 || + Reg == ARM::R8 || Reg == ARM::R9 || + ARM::DPRRegClass.contains(Reg); + }), + CalleeSavedInfo(ARM::R12)); } return false; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 9adfb1fab5f0..c4a782bc4091 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -514,12 +514,5 @@ ARMSubtarget::getPushPopSplitVariation(const MachineFunction &MF) const { F.needsUnwindTableEntry() && (MFI.hasVarSizedObjects() || getRegisterInfo()->hasStackRealignment(MF))) return SplitR11WindowsSEH; - - // Returns R11SplitAAPCSBranchSigning if R11 and lr are not adjacent to each - // other in the list of callee saved registers in a frame, and branch - // signing is enabled. - if (MF.getInfo()->shouldSignReturnAddress() && - getFramePointerReg() == ARM::R11) - return SplitR11AAPCSSignRA; return NoSplit; } diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 214c5f1b45e5..7917ddc17bdb 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -105,18 +105,6 @@ public: /// vpush {d8-d15} /// push {r11, lr} SplitR11WindowsSEH, - - /// When generating AAPCS-compilant frame chains, R11 is the frame pointer, - /// and must be pushed adjacent to the return address (LR). Normally this - /// isn't a problem, because the only register between them is r12, which is - /// the intra-procedure-call scratch register, so doesn't need to be saved. - /// However, when PACBTI is in use, r12 contains the authentication code, so - /// does need to be saved. This means that we need a separate push for R11 - /// and LR. 
- /// push {r0-r10, r12} - /// push {r11, lr} - /// vpush {d8-d15} - SplitR11AAPCSSignRA, }; protected: diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll deleted file mode 100644 index 8bcf87130c54..000000000000 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll +++ /dev/null @@ -1,150 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=thumbv8.1m.main-none-eabi < %s --force-dwarf-frame-section -frame-pointer=all -mattr=+aapcs-frame-chain | FileCheck %s - -; int test1() { -; return 0; -; } -define i32 @test1() "sign-return-address"="non-leaf" { -; CHECK-LABEL: test1: -; CHECK: .cfi_sections .debug_frame -; CHECK-NEXT: .cfi_startproc -; CHECK-NEXT: @ %bb.0: @ %entry -; CHECK-NEXT: pac r12, lr, sp -; CHECK-NEXT: .save {ra_auth_code} -; CHECK-NEXT: str r12, [sp, #-4]! -; CHECK-NEXT: .cfi_def_cfa_offset 4 -; CHECK-NEXT: .cfi_offset ra_auth_code, -4 -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push.w {r11, lr} -; CHECK-NEXT: .cfi_def_cfa_offset 12 -; CHECK-NEXT: .cfi_offset lr, -8 -; CHECK-NEXT: .cfi_offset r11, -12 -; CHECK-NEXT: .setfp r11, sp -; CHECK-NEXT: mov r11, sp -; CHECK-NEXT: .cfi_def_cfa_register r11 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: pop.w {r11, lr} -; CHECK-NEXT: ldr r12, [sp], #4 -; CHECK-NEXT: aut r12, lr, sp -; CHECK-NEXT: bx lr -entry: - ret i32 0 -} - -; void foo(int n) { -; int a[n]; -; bar(a); -; } -define dso_local void @test2(i32 noundef %n) "sign-return-address"="non-leaf" { -; CHECK-LABEL: test2: -; CHECK: .cfi_startproc -; CHECK-NEXT: @ %bb.0: @ %entry -; CHECK-NEXT: pac r12, lr, sp -; CHECK-NEXT: .save {r4, r7, ra_auth_code} -; CHECK-NEXT: push.w {r4, r7, r12} -; CHECK-NEXT: .cfi_def_cfa_offset 12 -; CHECK-NEXT: .cfi_offset ra_auth_code, -4 -; CHECK-NEXT: .cfi_offset r7, -8 -; CHECK-NEXT: .cfi_offset r4, -12 -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push.w {r11, lr} -; CHECK-NEXT: .cfi_def_cfa_offset 20 -; CHECK-NEXT: .cfi_offset lr, -16 -; CHECK-NEXT: .cfi_offset r11, -20 -; CHECK-NEXT: .setfp r11, sp -; CHECK-NEXT: mov r11, sp -; CHECK-NEXT: .cfi_def_cfa_register r11 -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: movs r1, #7 -; CHECK-NEXT: add.w r0, r1, r0, lsl #2 -; CHECK-NEXT: bic r0, r0, #7 -; CHECK-NEXT: sub.w r0, sp, r0 -; CHECK-NEXT: mov sp, r0 -; CHECK-NEXT: bl take_ptr -; CHECK-NEXT: mov sp, r11 -; CHECK-NEXT: pop.w {r11, lr} -; CHECK-NEXT: pop.w {r4, r7, r12} -; CHECK-NEXT: aut r12, lr, sp -; CHECK-NEXT: bx lr -entry: - %vla = alloca i32, i32 %n, align 4 - call void @take_ptr(ptr noundef nonnull %vla) - ret void -} - -; void test3(int c, float e, int z) { -; if (c) -; knr(); -; take_ptr(alloca(z)); -; if (e) -; knr(); -; } -define void @test3(i32 noundef %c, float noundef %e, i32 noundef %z) "sign-return-address"="non-leaf" { -; CHECK-LABEL: test3: -; CHECK: .cfi_startproc -; CHECK-NEXT: @ %bb.0: @ %entry -; CHECK-NEXT: pac r12, lr, sp -; CHECK-NEXT: .save {r4, r5, r6, r7, ra_auth_code} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r12} -; CHECK-NEXT: .cfi_def_cfa_offset 20 -; CHECK-NEXT: .cfi_offset ra_auth_code, -4 -; CHECK-NEXT: .cfi_offset r7, -8 -; CHECK-NEXT: .cfi_offset r6, -12 -; CHECK-NEXT: .cfi_offset r5, -16 -; CHECK-NEXT: .cfi_offset r4, -20 -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push.w {r11, lr} -; CHECK-NEXT: .cfi_def_cfa_offset 28 -; CHECK-NEXT: .cfi_offset lr, -24 -; CHECK-NEXT: .cfi_offset r11, -28 -; CHECK-NEXT: .setfp r11, sp -; CHECK-NEXT: mov r11, sp -; CHECK-NEXT: 
.cfi_def_cfa_register r11 -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: it ne -; CHECK-NEXT: blne knr -; CHECK-NEXT: adds r0, r5, #7 -; CHECK-NEXT: bic r0, r0, #7 -; CHECK-NEXT: sub.w r0, sp, r0 -; CHECK-NEXT: mov sp, r0 -; CHECK-NEXT: bl take_ptr -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: bl __aeabi_fcmpeq -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: bleq knr -; CHECK-NEXT: mov sp, r11 -; CHECK-NEXT: pop.w {r11, lr} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r12} -; CHECK-NEXT: aut r12, lr, sp -; CHECK-NEXT: bx lr -entry: - %tobool.not = icmp eq i32 %c, 0 - br i1 %tobool.not, label %if.end, label %if.then - -if.then: ; preds = %entry - tail call void @knr() - br label %if.end - -if.end: ; preds = %if.then, %entry - %0 = alloca i8, i32 %z, align 8 - call void @take_ptr(ptr noundef nonnull %0) - %tobool1 = fcmp une float %e, 0.000000e+00 - br i1 %tobool1, label %if.then2, label %if.end3 - -if.then2: ; preds = %if.end - call void @knr() - br label %if.end3 - -if.end3: ; preds = %if.then2, %if.end - ret void -} - -declare void @knr(...) -declare void @take_ptr(ptr noundef) -- GitLab From 508fd966fb00428ccd1dd7ddeb636fb7393029ec Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Fri, 18 Oct 2024 09:10:17 +0100 Subject: [PATCH 013/511] [CLANG][AArch64]Add SVE tuple types for mfloat8_t (#112687) This patch adds scalable tuple types vectors for MFloat_8 type, according to the ACLE[1]. [1] https://github.com/ARM-software/acle.git --- .../clang/Basic/AArch64SVEACLETypes.def | 6 ++ .../include/clang/Serialization/ASTBitCodes.h | 2 +- .../CodeGenCXX/aarch64-mangle-sve-vectors.cpp | 86 ++++++++++++++++++- .../CodeGenCXX/aarch64-sve-vector-init.cpp | 70 +++++++++++++++ clang/test/Modules/no-external-type-id.cppm | 2 +- clang/utils/TableGen/SveEmitter.cpp | 4 + 6 files changed, 167 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def index 72df1e35aaec..2b80e43b5063 100644 --- a/clang/include/clang/Basic/AArch64SVEACLETypes.def +++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def @@ -138,6 +138,8 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) +SVE_VECTOR_TYPE_INT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2, false) + // // x3 // @@ -158,6 +160,8 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) +SVE_VECTOR_TYPE_INT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3, false) + // // x4 // @@ -178,6 +182,8 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) +SVE_VECTOR_TYPE_INT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4, false) + SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1) SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2) SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 16, 4) diff --git a/clang/include/clang/Serialization/ASTBitCodes.h 
b/clang/include/clang/Serialization/ASTBitCodes.h index d735e2dcaa8c..e397dff09765 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1149,7 +1149,7 @@ enum PredefinedTypeIDs { /// /// Type IDs for non-predefined types will start at /// NUM_PREDEF_TYPE_IDs. -const unsigned NUM_PREDEF_TYPE_IDS = 506; +const unsigned NUM_PREDEF_TYPE_IDS = 509; // Ensure we do not overrun the predefined types we reserved // in the enum PredefinedTypeIDs above. diff --git a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp index 3f2b0622d551..9f481e1f0f08 100644 --- a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp +++ b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp @@ -59,6 +59,9 @@ void f(__clang_svbfloat16x3_t, __clang_svbfloat16x3_t); void f(__clang_svbfloat16x4_t, __clang_svbfloat16x4_t); void f(__clang_svboolx2_t, __clang_svboolx2_t); void f(__clang_svboolx4_t, __clang_svboolx4_t); +void f(__clang_svmfloat8x2_t, __clang_svmfloat8x2_t); +void f(__clang_svmfloat8x3_t, __clang_svmfloat8x3_t); +void f(__clang_svmfloat8x4_t, __clang_svmfloat8x4_t); // CHECK-LABEL: define dso_local void @_Z3foov( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -139,6 +142,12 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // CHECK-NEXT: [[COERCE73:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[COERCE74:%.*]] = alloca { , , , }, align 2 // CHECK-NEXT: [[COERCE75:%.*]] = alloca { , , , }, align 2 +// CHECK-NEXT: [[COERCE76:%.*]] = alloca { , }, align 16 +// CHECK-NEXT: [[COERCE77:%.*]] = alloca { , }, align 16 +// CHECK-NEXT: [[COERCE78:%.*]] = alloca { , , }, align 16 +// CHECK-NEXT: [[COERCE79:%.*]] = alloca { , , }, align 16 +// CHECK-NEXT: [[COERCE80:%.*]] = alloca { , , , }, align 16 +// CHECK-NEXT: [[COERCE81:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: call void @_Z1fu10__SVInt8_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu11__SVInt16_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu11__SVInt16_tS_( zeroinitializer, zeroinitializer) @@ -151,7 +160,7 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // CHECK-NEXT: call void @_Z1fu13__SVFloat16_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu13__SVFloat32_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu13__SVFloat64_tS_( zeroinitializer, zeroinitializer) -// CHECK-NEXT: call void @_Z1fu13__SVMfloat8_tS_( zeroinitializer, zeroinitializer) +// CHECK-NEXT: call void @_Z1fu13__SVMfloat8_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu14__SVBfloat16_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu10__SVBool_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu11__SVCount_tS_(target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") zeroinitializer) @@ -573,6 +582,39 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // CHECK-NEXT: [[COERCE75_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE75_TUPLE]], 2 // CHECK-NEXT: [[COERCE75_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE75_TUPLE]], 3 // CHECK-NEXT: call void @_Z1f10svboolx4_tS_( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], [[COERCE75_EXTRACT0]], [[COERCE75_EXTRACT1]], [[COERCE75_EXTRACT2]], [[COERCE75_EXTRACT3]]) +// CHECK-NEXT: store { , } zeroinitializer, ptr [[COERCE76]], align 16 +// CHECK-NEXT: [[COERCE76_TUPLE:%.*]] = load { , }, ptr [[COERCE76]], align 16 +// CHECK-NEXT: 
[[COERCE76_EXTRACT0:%.*]] = extractvalue { , } [[COERCE76_TUPLE]], 0 +// CHECK-NEXT: [[COERCE76_EXTRACT1:%.*]] = extractvalue { , } [[COERCE76_TUPLE]], 1 +// CHECK-NEXT: store { , } zeroinitializer, ptr [[COERCE77]], align 16 +// CHECK-NEXT: [[COERCE77_TUPLE:%.*]] = load { , }, ptr [[COERCE77]], align 16 +// CHECK-NEXT: [[COERCE77_EXTRACT0:%.*]] = extractvalue { , } [[COERCE77_TUPLE]], 0 +// CHECK-NEXT: [[COERCE77_EXTRACT1:%.*]] = extractvalue { , } [[COERCE77_TUPLE]], 1 +// CHECK-NEXT: call void @_Z1f13svmfloat8x2_tS_( [[COERCE76_EXTRACT0]], [[COERCE76_EXTRACT1]], [[COERCE77_EXTRACT0]], [[COERCE77_EXTRACT1]]) +// CHECK-NEXT: store { , , } zeroinitializer, ptr [[COERCE78]], align 16 +// CHECK-NEXT: [[COERCE78_TUPLE:%.*]] = load { , , }, ptr [[COERCE78]], align 16 +// CHECK-NEXT: [[COERCE78_EXTRACT0:%.*]] = extractvalue { , , } [[COERCE78_TUPLE]], 0 +// CHECK-NEXT: [[COERCE78_EXTRACT1:%.*]] = extractvalue { , , } [[COERCE78_TUPLE]], 1 +// CHECK-NEXT: [[COERCE78_EXTRACT2:%.*]] = extractvalue { , , } [[COERCE78_TUPLE]], 2 +// CHECK-NEXT: store { , , } zeroinitializer, ptr [[COERCE79]], align 16 +// CHECK-NEXT: [[COERCE79_TUPLE:%.*]] = load { , , }, ptr [[COERCE79]], align 16 +// CHECK-NEXT: [[COERCE79_EXTRACT0:%.*]] = extractvalue { , , } [[COERCE79_TUPLE]], 0 +// CHECK-NEXT: [[COERCE79_EXTRACT1:%.*]] = extractvalue { , , } [[COERCE79_TUPLE]], 1 +// CHECK-NEXT: [[COERCE79_EXTRACT2:%.*]] = extractvalue { , , } [[COERCE79_TUPLE]], 2 +// CHECK-NEXT: call void @_Z1f13svmfloat8x3_tS_( [[COERCE78_EXTRACT0]], [[COERCE78_EXTRACT1]], [[COERCE78_EXTRACT2]], [[COERCE79_EXTRACT0]], [[COERCE79_EXTRACT1]], [[COERCE79_EXTRACT2]]) +// CHECK-NEXT: store { , , , } zeroinitializer, ptr [[COERCE80]], align 16 +// CHECK-NEXT: [[COERCE80_TUPLE:%.*]] = load { , , , }, ptr [[COERCE80]], align 16 +// CHECK-NEXT: [[COERCE80_EXTRACT0:%.*]] = extractvalue { , , , } [[COERCE80_TUPLE]], 0 +// CHECK-NEXT: [[COERCE80_EXTRACT1:%.*]] = extractvalue { , , , } [[COERCE80_TUPLE]], 1 +// CHECK-NEXT: [[COERCE80_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE80_TUPLE]], 2 +// CHECK-NEXT: [[COERCE80_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE80_TUPLE]], 3 +// CHECK-NEXT: store { , , , } zeroinitializer, ptr [[COERCE81]], align 16 +// CHECK-NEXT: [[COERCE81_TUPLE:%.*]] = load { , , , }, ptr [[COERCE81]], align 16 +// CHECK-NEXT: [[COERCE81_EXTRACT0:%.*]] = extractvalue { , , , } [[COERCE81_TUPLE]], 0 +// CHECK-NEXT: [[COERCE81_EXTRACT1:%.*]] = extractvalue { , , , } [[COERCE81_TUPLE]], 1 +// CHECK-NEXT: [[COERCE81_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE81_TUPLE]], 2 +// CHECK-NEXT: [[COERCE81_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE81_TUPLE]], 3 +// CHECK-NEXT: call void @_Z1f13svmfloat8x4_tS_( [[COERCE80_EXTRACT0]], [[COERCE80_EXTRACT1]], [[COERCE80_EXTRACT2]], [[COERCE80_EXTRACT3]], [[COERCE81_EXTRACT0]], [[COERCE81_EXTRACT1]], [[COERCE81_EXTRACT2]], [[COERCE81_EXTRACT3]]) // CHECK-NEXT: ret void // // COMPAT_17-LABEL: define dso_local void @_Z3foov( @@ -654,6 +696,12 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // COMPAT_17-NEXT: [[COERCE73:%.*]] = alloca { , }, align 2 // COMPAT_17-NEXT: [[COERCE74:%.*]] = alloca { , , , }, align 2 // COMPAT_17-NEXT: [[COERCE75:%.*]] = alloca { , , , }, align 2 +// COMPAT_17-NEXT: [[COERCE76:%.*]] = alloca { , }, align 16 +// COMPAT_17-NEXT: [[COERCE77:%.*]] = alloca { , }, align 16 +// COMPAT_17-NEXT: [[COERCE78:%.*]] = alloca { , , }, align 16 +// COMPAT_17-NEXT: [[COERCE79:%.*]] = alloca { , , }, align 16 +// COMPAT_17-NEXT: [[COERCE80:%.*]] = alloca { , , , }, 
align 16 +// COMPAT_17-NEXT: [[COERCE81:%.*]] = alloca { , , , }, align 16 // COMPAT_17-NEXT: call void @_Z1fu10__SVInt8_tu10__SVInt8_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu11__SVInt16_tu11__SVInt16_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu11__SVInt16_tu11__SVInt16_t( zeroinitializer, zeroinitializer) @@ -1088,6 +1136,39 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // COMPAT_17-NEXT: [[COERCE75_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE75_TUPLE]], 2 // COMPAT_17-NEXT: [[COERCE75_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE75_TUPLE]], 3 // COMPAT_17-NEXT: call void @_Z1f10svboolx4_t10svboolx4_t( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], [[COERCE75_EXTRACT0]], [[COERCE75_EXTRACT1]], [[COERCE75_EXTRACT2]], [[COERCE75_EXTRACT3]]) +// COMPAT_17-NEXT: store { , } zeroinitializer, ptr [[COERCE76]], align 16 +// COMPAT_17-NEXT: [[COERCE76_TUPLE:%.*]] = load { , }, ptr [[COERCE76]], align 16 +// COMPAT_17-NEXT: [[COERCE76_EXTRACT0:%.*]] = extractvalue { , } [[COERCE76_TUPLE]], 0 +// COMPAT_17-NEXT: [[COERCE76_EXTRACT1:%.*]] = extractvalue { , } [[COERCE76_TUPLE]], 1 +// COMPAT_17-NEXT: store { , } zeroinitializer, ptr [[COERCE77]], align 16 +// COMPAT_17-NEXT: [[COERCE77_TUPLE:%.*]] = load { , }, ptr [[COERCE77]], align 16 +// COMPAT_17-NEXT: [[COERCE77_EXTRACT0:%.*]] = extractvalue { , } [[COERCE77_TUPLE]], 0 +// COMPAT_17-NEXT: [[COERCE77_EXTRACT1:%.*]] = extractvalue { , } [[COERCE77_TUPLE]], 1 +// COMPAT_17-NEXT: call void @_Z1f13svmfloat8x2_t13svmfloat8x2_t( [[COERCE76_EXTRACT0]], [[COERCE76_EXTRACT1]], [[COERCE77_EXTRACT0]], [[COERCE77_EXTRACT1]]) +// COMPAT_17-NEXT: store { , , } zeroinitializer, ptr [[COERCE78]], align 16 +// COMPAT_17-NEXT: [[COERCE78_TUPLE:%.*]] = load { , , }, ptr [[COERCE78]], align 16 +// COMPAT_17-NEXT: [[COERCE78_EXTRACT0:%.*]] = extractvalue { , , } [[COERCE78_TUPLE]], 0 +// COMPAT_17-NEXT: [[COERCE78_EXTRACT1:%.*]] = extractvalue { , , } [[COERCE78_TUPLE]], 1 +// COMPAT_17-NEXT: [[COERCE78_EXTRACT2:%.*]] = extractvalue { , , } [[COERCE78_TUPLE]], 2 +// COMPAT_17-NEXT: store { , , } zeroinitializer, ptr [[COERCE79]], align 16 +// COMPAT_17-NEXT: [[COERCE79_TUPLE:%.*]] = load { , , }, ptr [[COERCE79]], align 16 +// COMPAT_17-NEXT: [[COERCE79_EXTRACT0:%.*]] = extractvalue { , , } [[COERCE79_TUPLE]], 0 +// COMPAT_17-NEXT: [[COERCE79_EXTRACT1:%.*]] = extractvalue { , , } [[COERCE79_TUPLE]], 1 +// COMPAT_17-NEXT: [[COERCE79_EXTRACT2:%.*]] = extractvalue { , , } [[COERCE79_TUPLE]], 2 +// COMPAT_17-NEXT: call void @_Z1f13svmfloat8x3_t13svmfloat8x3_t( [[COERCE78_EXTRACT0]], [[COERCE78_EXTRACT1]], [[COERCE78_EXTRACT2]], [[COERCE79_EXTRACT0]], [[COERCE79_EXTRACT1]], [[COERCE79_EXTRACT2]]) +// COMPAT_17-NEXT: store { , , , } zeroinitializer, ptr [[COERCE80]], align 16 +// COMPAT_17-NEXT: [[COERCE80_TUPLE:%.*]] = load { , , , }, ptr [[COERCE80]], align 16 +// COMPAT_17-NEXT: [[COERCE80_EXTRACT0:%.*]] = extractvalue { , , , } [[COERCE80_TUPLE]], 0 +// COMPAT_17-NEXT: [[COERCE80_EXTRACT1:%.*]] = extractvalue { , , , } [[COERCE80_TUPLE]], 1 +// COMPAT_17-NEXT: [[COERCE80_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE80_TUPLE]], 2 +// COMPAT_17-NEXT: [[COERCE80_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE80_TUPLE]], 3 +// COMPAT_17-NEXT: store { , , , } zeroinitializer, ptr [[COERCE81]], align 16 +// COMPAT_17-NEXT: [[COERCE81_TUPLE:%.*]] = load { , , , }, ptr [[COERCE81]], align 16 +// COMPAT_17-NEXT: [[COERCE81_EXTRACT0:%.*]] = extractvalue { , , 
, } [[COERCE81_TUPLE]], 0 +// COMPAT_17-NEXT: [[COERCE81_EXTRACT1:%.*]] = extractvalue { , , , } [[COERCE81_TUPLE]], 1 +// COMPAT_17-NEXT: [[COERCE81_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE81_TUPLE]], 2 +// COMPAT_17-NEXT: [[COERCE81_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE81_TUPLE]], 3 +// COMPAT_17-NEXT: call void @_Z1f13svmfloat8x4_t13svmfloat8x4_t( [[COERCE80_EXTRACT0]], [[COERCE80_EXTRACT1]], [[COERCE80_EXTRACT2]], [[COERCE80_EXTRACT3]], [[COERCE81_EXTRACT0]], [[COERCE81_EXTRACT1]], [[COERCE81_EXTRACT2]], [[COERCE81_EXTRACT3]]) // COMPAT_17-NEXT: ret void // void foo() { @@ -1146,4 +1227,7 @@ void foo() { f(__clang_svbfloat16x4_t(), __clang_svbfloat16x4_t()); f(__clang_svboolx2_t(), __clang_svboolx2_t()); f(__clang_svboolx4_t(), __clang_svboolx4_t()); + f(__clang_svmfloat8x2_t(), __clang_svmfloat8x2_t()); + f(__clang_svmfloat8x3_t(), __clang_svmfloat8x3_t()); + f(__clang_svmfloat8x4_t(), __clang_svmfloat8x4_t()); } diff --git a/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp b/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp index 45cf8081eb3a..f9068364d0dc 100644 --- a/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp +++ b/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp @@ -57,6 +57,9 @@ // CHECK-NEXT: [[B8X2:%.*]] = alloca { , }, align 2 // CHECK-NEXT: [[B8X4:%.*]] = alloca { , , , }, align 2 // CHECK-NEXT: [[CNT:%.*]] = alloca target("aarch64.svcount"), align 2 +// CHECK-NEXT: [[MF8X2:%.*]] = alloca { , }, align 16 +// CHECK-NEXT: [[MF8X3:%.*]] = alloca { , , }, align 16 +// CHECK-NEXT: [[MF8X4:%.*]] = alloca { , , , }, align 16 // CHECK-NEXT: store zeroinitializer, ptr [[S8]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[S16]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[S32]], align 16 @@ -110,6 +113,9 @@ // CHECK-NEXT: store { , } zeroinitializer, ptr [[B8X2]], align 2 // CHECK-NEXT: store { , , , } zeroinitializer, ptr [[B8X4]], align 2 // CHECK-NEXT: store target("aarch64.svcount") zeroinitializer, ptr [[CNT]], align 2 +// CHECK-NEXT: store { , } zeroinitializer, ptr [[MF8X2]], align 16 +// CHECK-NEXT: store { , , } zeroinitializer, ptr [[MF8X3]], align 16 +// CHECK-NEXT: store { , , , } zeroinitializer, ptr [[MF8X4]], align 16 // CHECK-NEXT: ret void // void test_locals(void) { @@ -171,6 +177,10 @@ void test_locals(void) { __clang_svboolx4_t b8x4{}; __SVCount_t cnt{}; + + __clang_svmfloat8x2_t mf8x2{}; + __clang_svmfloat8x3_t mf8x3{}; + __clang_svmfloat8x4_t mf8x4{}; } // CHECK-LABEL: define dso_local void @_Z12test_copy_s8u10__SVInt8_t @@ -1142,3 +1152,63 @@ void test_copy_b8x4(__clang_svboolx4_t a) { void test_copy_cnt(__SVCount_t a) { __SVCount_t b{a}; } + +// CHECK-LABEL: define dso_local void @_Z15test_copy_mf8x213svmfloat8x2_t +// CHECK-SAME: ( [[A_COERCE0:%.*]], [[A_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A:%.*]] = alloca { , }, align 16 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca { , }, align 16 +// CHECK-NEXT: [[B:%.*]] = alloca { , }, align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[A_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[A_COERCE1]], 1 +// CHECK-NEXT: store { , } [[TMP1]], ptr [[A]], align 16 +// CHECK-NEXT: [[A1:%.*]] = load { , }, ptr [[A]], align 16 +// CHECK-NEXT: store { , } [[A1]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load { , }, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store { , } [[TMP2]], ptr [[B]], align 16 +// CHECK-NEXT: ret void +// +void test_copy_mf8x2(__clang_svmfloat8x2_t a) { + __clang_svmfloat8x2_t b{a}; +} + 
+// CHECK-LABEL: define dso_local void @_Z15test_copy_mf8x313svmfloat8x3_t +// CHECK-SAME: ( [[A_COERCE0:%.*]], [[A_COERCE1:%.*]], [[A_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A:%.*]] = alloca { , , }, align 16 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca { , , }, align 16 +// CHECK-NEXT: [[B:%.*]] = alloca { , , }, align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[A_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[A_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[A_COERCE2]], 2 +// CHECK-NEXT: store { , , } [[TMP2]], ptr [[A]], align 16 +// CHECK-NEXT: [[A1:%.*]] = load { , , }, ptr [[A]], align 16 +// CHECK-NEXT: store { , , } [[A1]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load { , , }, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store { , , } [[TMP3]], ptr [[B]], align 16 +// CHECK-NEXT: ret void +// +void test_copy_mf8x3(__clang_svmfloat8x3_t a) { + __clang_svmfloat8x3_t b{a}; +} + +// CHECK-LABEL: define dso_local void @_Z15test_copy_mf8x413svmfloat8x4_t +// CHECK-SAME: ( [[A_COERCE0:%.*]], [[A_COERCE1:%.*]], [[A_COERCE2:%.*]], [[A_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A:%.*]] = alloca { , , , }, align 16 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca { , , , }, align 16 +// CHECK-NEXT: [[B:%.*]] = alloca { , , , }, align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[A_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[A_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[A_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[A_COERCE3]], 3 +// CHECK-NEXT: store { , , , } [[TMP3]], ptr [[A]], align 16 +// CHECK-NEXT: [[A1:%.*]] = load { , , , }, ptr [[A]], align 16 +// CHECK-NEXT: store { , , , } [[A1]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load { , , , }, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store { , , , } [[TMP4]], ptr [[B]], align 16 +// CHECK-NEXT: ret void +// +void test_copy_mf8x4(__clang_svmfloat8x4_t a) { + __clang_svmfloat8x4_t b{a}; +} diff --git a/clang/test/Modules/no-external-type-id.cppm b/clang/test/Modules/no-external-type-id.cppm index a4ca389739fb..577b97f5930e 100644 --- a/clang/test/Modules/no-external-type-id.cppm +++ b/clang/test/Modules/no-external-type-id.cppm @@ -23,7 +23,7 @@ export module b; import a; export int b(); -// CHECK: Date: Fri, 18 Oct 2024 10:11:14 +0200 Subject: [PATCH 014/511] [clang] Deduplicate the logic that only warns once when stack is almost full (#112552) Zero diff in behavior. 
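Before the diff, a minimal sketch of how the new helper is meant to be used by its consumers (illustrative only; Diags and Loc stand for whatever DiagnosticsEngine and SourceLocation the caller already holds, and the real call sites follow in the diff):

    StackExhaustionHandler StackHandler(Diags); // one handler per consumer
    // Warns at most once per handler instance, then runs the callback with
    // freshly allocated stack when the current stack is nearly exhausted.
    StackHandler.runWithSufficientStackSpace(Loc, [&] {
      /* deeply recursive work, e.g. template instantiation */
    });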
---
 .../clang/Basic/StackExhaustionHandler.h      | 45 +++++++++++++++++++
 clang/include/clang/Sema/Sema.h               |  6 +--
 clang/include/clang/Serialization/ASTReader.h |  6 ++-
 clang/lib/Basic/CMakeLists.txt                |  1 +
 clang/lib/Basic/StackExhaustionHandler.cpp    | 35 +++++++++++++++
 clang/lib/CodeGen/CodeGenModule.cpp           | 12 +----
 clang/lib/CodeGen/CodeGenModule.h             |  6 +--
 clang/lib/Sema/Sema.cpp                       | 12 +----
 clang/lib/Sema/SemaTemplateInstantiate.cpp    |  3 +-
 clang/lib/Serialization/ASTReader.cpp         | 21 +++++----
 clang/lib/Serialization/ASTReaderDecl.cpp     |  3 +-
 11 files changed, 105 insertions(+), 45 deletions(-)
 create mode 100644 clang/include/clang/Basic/StackExhaustionHandler.h
 create mode 100644 clang/lib/Basic/StackExhaustionHandler.cpp

diff --git a/clang/include/clang/Basic/StackExhaustionHandler.h b/clang/include/clang/Basic/StackExhaustionHandler.h
new file mode 100644
index 000000000000..fb02b9521cb4
--- /dev/null
+++ b/clang/include/clang/Basic/StackExhaustionHandler.h
@@ -0,0 +1,45 @@
+//===--- StackExhaustionHandler.h - A utility for warning once when close to out
+// of stack space -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Defines a utility for warning once when close to out of stack space.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_STACK_EXHAUSTION_HANDLER_H
+#define LLVM_CLANG_BASIC_STACK_EXHAUSTION_HANDLER_H
+
+#include "clang/Basic/Diagnostic.h"
+
+namespace clang {
+class StackExhaustionHandler {
+public:
+  StackExhaustionHandler(DiagnosticsEngine &diags) : DiagsRef(diags) {}
+
+  /// Run some code with "sufficient" stack space. (Currently, at least 256K
+  /// is guaranteed). Produces a warning if we're low on stack space and
+  /// allocates more in that case. Use this in code that may recurse deeply to
+  /// avoid stack overflow.
+  void runWithSufficientStackSpace(SourceLocation Loc,
+                                   llvm::function_ref<void()> Fn);
+
+  /// Check to see if we're low on stack space, and produce a warning if so
+  /// (currently, at least 256K is guaranteed).
+  void warnOnStackNearlyExhausted(SourceLocation Loc);
+
+private:
+  /// Warn that the stack is nearly exhausted.
+  void warnStackExhausted(SourceLocation Loc);
+
+  DiagnosticsEngine &DiagsRef;
+  bool WarnedStackExhausted = false;
+};
+} // end namespace clang
+
+#endif // LLVM_CLANG_BASIC_STACK_EXHAUSTION_HANDLER_H
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 2c5769f8469e..bc9c422ed4c4 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -49,6 +49,7 @@
 #include "clang/Basic/PragmaKinds.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/Specifiers.h"
+#include "clang/Basic/StackExhaustionHandler.h"
 #include "clang/Basic/TemplateKinds.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Basic/TypeTraits.h"
@@ -546,9 +547,6 @@ public:
   /// Print out statistics about the semantic analysis.
   void PrintStats() const;
 
-  /// Warn that the stack is nearly exhausted.
-  void warnStackExhausted(SourceLocation Loc);
-
   /// Run some code with "sufficient" stack space. (Currently, at least 256K is
   /// guaranteed). Produces a warning if we're low on stack space and allocates
   /// more in that case. Use this in code that may recurse deeply (for example,
@@ -1183,7 +1181,7 @@ private:
   std::optional<std::unique_ptr<DarwinSDKInfo>> CachedDarwinSDKInfo;
   bool WarnedDarwinSDKInfoMissing = false;
 
-  bool WarnedStackExhausted = false;
+  StackExhaustionHandler StackHandler;
 
   Sema(const Sema &) = delete;
   void operator=(const Sema &) = delete;
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index ee4e897b2488..b476a40ebd2c 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -19,6 +19,7 @@
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/OpenCLOptions.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/StackExhaustionHandler.h"
 #include "clang/Basic/Version.h"
 #include "clang/Lex/ExternalPreprocessorSource.h"
 #include "clang/Lex/HeaderSearch.h"
@@ -445,7 +446,7 @@ private:
   DiagnosticsEngine &Diags;
   // Sema has duplicate logic, but SemaObj can sometimes be null so ASTReader
   // has its own version.
-  bool WarnedStackExhausted = false;
+  StackExhaustionHandler StackHandler;
 
   /// The semantic analysis object that will be processing the
   /// AST files and the translation unit that uses it.
@@ -2180,7 +2181,8 @@ public:
   /// Report a diagnostic.
   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const;
 
-  void warnStackExhausted(SourceLocation Loc);
+  void runWithSufficientStackSpace(SourceLocation Loc,
+                                   llvm::function_ref<void()> Fn);
 
   IdentifierInfo *DecodeIdentifierInfo(serialization::IdentifierID ID);
 
diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt
index e7ebc8f191aa..e11e1ac4a6fa 100644
--- a/clang/lib/Basic/CMakeLists.txt
+++ b/clang/lib/Basic/CMakeLists.txt
@@ -89,6 +89,7 @@ add_clang_library(clangBasic
   SourceManager.cpp
   SourceMgrAdapter.cpp
   Stack.cpp
+  StackExhaustionHandler.cpp
   TargetID.cpp
   TargetInfo.cpp
   Targets.cpp
diff --git a/clang/lib/Basic/StackExhaustionHandler.cpp b/clang/lib/Basic/StackExhaustionHandler.cpp
new file mode 100644
index 000000000000..24b499c810db
--- /dev/null
+++ b/clang/lib/Basic/StackExhaustionHandler.cpp
@@ -0,0 +1,35 @@
+//===--- StackExhaustionHandler.cpp - A utility for warning once when close
+// to out of stack space -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Defines a utility for warning once when close to out of stack space.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/StackExhaustionHandler.h"
+#include "clang/Basic/Stack.h"
+
+void clang::StackExhaustionHandler::runWithSufficientStackSpace(
+    SourceLocation Loc, llvm::function_ref<void()> Fn) {
+  clang::runWithSufficientStackSpace([&] { warnStackExhausted(Loc); }, Fn);
+}
+
+void clang::StackExhaustionHandler::warnOnStackNearlyExhausted(
+    SourceLocation Loc) {
+  if (isStackNearlyExhausted())
+    warnStackExhausted(Loc);
+}
+
+void clang::StackExhaustionHandler::warnStackExhausted(SourceLocation Loc) {
+  // Only warn about this once.
+ if (!WarnedStackExhausted) { + DiagsRef.Report(Loc, diag::warn_stack_exhausted); + WarnedStackExhausted = true; + } +} diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 9a84a11973b1..24655b809b2e 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -342,7 +342,7 @@ CodeGenModule::CodeGenModule(ASTContext &C, : Context(C), LangOpts(C.getLangOpts()), FS(FS), HeaderSearchOpts(HSO), PreprocessorOpts(PPO), CodeGenOpts(CGO), TheModule(M), Diags(diags), Target(C.getTargetInfo()), ABI(createCXXABI(*this)), - VMContext(M.getContext()), VTables(*this), + VMContext(M.getContext()), VTables(*this), StackHandler(diags), SanitizerMD(new SanitizerMetadata(*this)) { // Initialize the type cache. @@ -1595,17 +1595,9 @@ void CodeGenModule::ErrorUnsupported(const Decl *D, const char *Type) { getDiags().Report(Context.getFullLoc(D->getLocation()), DiagID) << Msg; } -void CodeGenModule::warnStackExhausted(SourceLocation Loc) { - // Only warn about this once. - if (!WarnedStackExhausted) { - getDiags().Report(Loc, diag::warn_stack_exhausted); - WarnedStackExhausted = true; - } -} - void CodeGenModule::runWithSufficientStackSpace(SourceLocation Loc, llvm::function_ref Fn) { - clang::runWithSufficientStackSpace([&] { warnStackExhausted(Loc); }, Fn); + StackHandler.runWithSufficientStackSpace(Loc, Fn); } llvm::ConstantInt *CodeGenModule::getSize(CharUnits size) { diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index fa82a81b05dd..1b77490e261c 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -26,6 +26,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Basic/NoSanitizeList.h" #include "clang/Basic/ProfileList.h" +#include "clang/Basic/StackExhaustionHandler.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/XRayLists.h" #include "clang/Lex/PreprocessorOptions.h" @@ -336,7 +337,7 @@ private: std::unique_ptr PGOReader; InstrProfStats PGOStats; std::unique_ptr SanStats; - bool WarnedStackExhausted = false; + StackExhaustionHandler StackHandler; // A set of references that have only been seen via a weakref so far. This is // used to remove the weak of the reference if we ever see a direct reference @@ -1298,9 +1299,6 @@ public: /// Print out an error that codegen doesn't support the specified decl yet. void ErrorUnsupported(const Decl *D, const char *Type); - /// Warn that the stack is nearly exhausted. - void warnStackExhausted(SourceLocation Loc); - /// Run some code with "sufficient" stack space. (Currently, at least 256K is /// guaranteed). Produces a warning if we're low on stack space and allocates /// more in that case. 
Use this in code that may recurse deeply to avoid stack diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index f0d1634af529..5e9886a10946 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -220,7 +220,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, AnalysisWarnings(*this), ThreadSafetyDeclCache(nullptr), LateTemplateParser(nullptr), LateTemplateParserCleanup(nullptr), OpaqueParser(nullptr), CurContext(nullptr), ExternalSource(nullptr), - CurScope(nullptr), Ident_super(nullptr), + StackHandler(Diags), CurScope(nullptr), Ident_super(nullptr), AMDGPUPtr(std::make_unique(*this)), ARMPtr(std::make_unique(*this)), AVRPtr(std::make_unique(*this)), @@ -562,17 +562,9 @@ Sema::~Sema() { SemaPPCallbackHandler->reset(); } -void Sema::warnStackExhausted(SourceLocation Loc) { - // Only warn about this once. - if (!WarnedStackExhausted) { - Diag(Loc, diag::warn_stack_exhausted); - WarnedStackExhausted = true; - } -} - void Sema::runWithSufficientStackSpace(SourceLocation Loc, llvm::function_ref Fn) { - clang::runWithSufficientStackSpace([&] { warnStackExhausted(Loc); }, Fn); + StackHandler.runWithSufficientStackSpace(Loc, Fn); } bool Sema::makeUnavailableInSystemHeader(SourceLocation loc, diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 8665c099903d..457a9968c32a 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -806,8 +806,7 @@ void Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) { // Check to see if we're low on stack space. We can't do anything about this // from here, but we can at least warn the user. - if (isStackNearlyExhausted()) - warnStackExhausted(Ctx.PointOfInstantiation); + StackHandler.warnOnStackNearlyExhausted(Ctx.PointOfInstantiation); } void Sema::popCodeSynthesisContext() { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 1b2473f24573..1cf6c9352f36 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -64,6 +64,7 @@ #include "clang/Basic/SourceManager.h" #include "clang/Basic/SourceManagerInternals.h" #include "clang/Basic/Specifiers.h" +#include "clang/Basic/Stack.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "clang/Basic/TokenKinds.h" @@ -9648,18 +9649,15 @@ DiagnosticBuilder ASTReader::Diag(SourceLocation Loc, unsigned DiagID) const { return Diags.Report(Loc, DiagID); } -void ASTReader::warnStackExhausted(SourceLocation Loc) { +void ASTReader::runWithSufficientStackSpace(SourceLocation Loc, + llvm::function_ref Fn) { // When Sema is available, avoid duplicate errors. if (SemaObj) { - SemaObj->warnStackExhausted(Loc); + SemaObj->runWithSufficientStackSpace(Loc, Fn); return; } - if (WarnedStackExhausted) - return; - WarnedStackExhausted = true; - - Diag(Loc, diag::warn_stack_exhausted); + StackHandler.runWithSufficientStackSpace(Loc, Fn); } /// Retrieve the identifier table associated with the @@ -10509,13 +10507,14 @@ ASTReader::ASTReader(Preprocessor &PP, InMemoryModuleCache &ModuleCache, bool AllowConfigurationMismatch, bool ValidateSystemInputs, bool ValidateASTInputFilesContent, bool UseGlobalIndex, std::unique_ptr ReadTimer) - : Listener(bool(DisableValidationKind &DisableValidationForModuleKind::PCH) + : Listener(bool(DisableValidationKind & DisableValidationForModuleKind::PCH) ? 
cast(new SimpleASTReaderListener(PP)) : cast(new PCHValidator(PP, *this))), SourceMgr(PP.getSourceManager()), FileMgr(PP.getFileManager()), - PCHContainerRdr(PCHContainerRdr), Diags(PP.getDiagnostics()), PP(PP), - ContextObj(Context), ModuleMgr(PP.getFileManager(), ModuleCache, - PCHContainerRdr, PP.getHeaderSearchInfo()), + PCHContainerRdr(PCHContainerRdr), Diags(PP.getDiagnostics()), + StackHandler(Diags), PP(PP), ContextObj(Context), + ModuleMgr(PP.getFileManager(), ModuleCache, PCHContainerRdr, + PP.getHeaderSearchInfo()), DummyIdResolver(PP), ReadTimer(std::move(ReadTimer)), isysroot(isysroot), DisableValidationKind(DisableValidationKind), AllowASTWithCompilerErrors(AllowASTWithCompilerErrors), diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 1ccc810f415e..d4e392dcc6bc 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -4168,8 +4168,7 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { D->setDeclContext(Context.getTranslationUnitDecl()); // Reading some declarations can result in deep recursion. - clang::runWithSufficientStackSpace([&] { warnStackExhausted(DeclLoc); }, - [&] { Reader.Visit(D); }); + runWithSufficientStackSpace(DeclLoc, [&] { Reader.Visit(D); }); // If this declaration is also a declaration context, get the // offsets for its tables of lexical and visible declarations. -- GitLab From abfba7d2e6a3cb0f1d0c976898447957dbbca6e0 Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Fri, 18 Oct 2024 10:18:34 +0200 Subject: [PATCH 015/511] [clang] Fix C23 constexpr crashes (#112708) Before using a constexpr variable that is not properly initialized check that it is valid. Fixes https://github.com/llvm/llvm-project/issues/109095 Fixes https://github.com/llvm/llvm-project/issues/112516 --- clang/lib/AST/Decl.cpp | 10 +++++++--- clang/test/Sema/constexpr.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index f083ffff87a8..8321cee0e0bc 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2512,7 +2512,8 @@ bool VarDecl::isUsableInConstantExpressions(const ASTContext &Context) const { if (!DefVD->mightBeUsableInConstantExpressions(Context)) return false; // ... and its initializer is a constant initializer. - if (Context.getLangOpts().CPlusPlus && !DefVD->hasConstantInitialization()) + if ((Context.getLangOpts().CPlusPlus || getLangOpts().C23) && + !DefVD->hasConstantInitialization()) return false; // C++98 [expr.const]p1: // An integral constant-expression can involve only [...] const variables @@ -2619,8 +2620,11 @@ bool VarDecl::hasICEInitializer(const ASTContext &Context) const { } bool VarDecl::hasConstantInitialization() const { - // In C, all globals (and only globals) have constant initialization. - if (hasGlobalStorage() && !getASTContext().getLangOpts().CPlusPlus) + // In C, all globals and constexpr variables should have constant + // initialization. For constexpr variables in C check that initializer is a + // constant initializer because they can be used in constant expressions. 
+  if (hasGlobalStorage() && !getASTContext().getLangOpts().CPlusPlus &&
+      !isConstexpr())
     return true;
 
   // In C++, it depends on whether the evaluation at the point of definition
diff --git a/clang/test/Sema/constexpr.c b/clang/test/Sema/constexpr.c
index eaa000b3b977..3dcb0b3a7d95 100644
--- a/clang/test/Sema/constexpr.c
+++ b/clang/test/Sema/constexpr.c
@@ -374,3 +374,20 @@ void constexprif() {
 void constevalif() {
   if consteval (300) {} //expected-error {{expected '(' after 'if'}}
 }
+
+struct S11 {
+  int len;
+};
+void ghissue112516() {
+  struct S11 *s11 = 0;
+  constexpr int num = s11->len; // expected-error {{constexpr variable 'num' must be initialized by a constant expression}}
+  void *Arr[num];
+}
+
+void ghissue109095() {
+  constexpr char c[] = { 'a' };
+  constexpr int i = c[1]; // expected-error {{constexpr variable 'i' must be initialized by a constant expression}}\
+                          // expected-note {{declared here}}
+  _Static_assert(i == c[0]); // expected-error {{static assertion expression is not an integral constant expression}}\
+                             // expected-note {{initializer of 'i' is not a constant expression}}
+}
-- 
GitLab


From 0d1a91e8f91e364b83f77e597dfb835d70fe9cf9 Mon Sep 17 00:00:00 2001
From: Hari Limaye
Date: Fri, 18 Oct 2024 09:21:19 +0100
Subject: [PATCH 016/511] [FuncSpec] Update MinFunctionSize logic (#112711)

Always require functions to be larger than MinFunctionSize when
SpecializeLiteralConstant is enabled, and increase MinFunctionSize to
500, to prevent excessive triggering of specialisations on small
functions.

---
 .../lib/Transforms/IPO/FunctionSpecialization.cpp | 15 ++++++++++-----
 .../Transforms/FunctionSpecialization/noinline.ll |  2 +-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index bd0a337e579e..7feebbe420ae 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -57,9 +57,9 @@ static cl::opt<unsigned> MaxBlockPredecessors(
             "considered during the estimation of dead code"));
 
 static cl::opt<unsigned> MinFunctionSize(
-    "funcspec-min-function-size", cl::init(300), cl::Hidden, cl::desc(
-    "Don't specialize functions that have less than this number of "
-    "instructions"));
+    "funcspec-min-function-size", cl::init(500), cl::Hidden,
+    cl::desc("Don't specialize functions that have less than this number of "
+             "instructions"));
 
 static cl::opt<unsigned> MaxCodeSizeGrowth(
     "funcspec-max-codesize-growth", cl::init(3), cl::Hidden, cl::desc(
@@ -641,12 +641,17 @@ bool FunctionSpecializer::run() {
       Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
   }
 
+  // When specializing literal constants is enabled, always require functions
+  // to be larger than MinFunctionSize, to prevent excessive specialization.
+  const bool RequireMinSize =
+      !ForceSpecialization &&
+      (SpecializeLiteralConstant || !F.hasFnAttribute(Attribute::NoInline));
+
   // If the code metrics reveal that we shouldn't duplicate the function,
   // or if the code size implies that this function is easy to get inlined,
   // then we shouldn't specialize it.
    if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
-        (!ForceSpecialization && !F.hasFnAttribute(Attribute::NoInline) &&
-         Metrics.NumInsts < MinFunctionSize))
+        (RequireMinSize && Metrics.NumInsts < MinFunctionSize))
       continue;
 
     // TODO: For now only consider recursive functions when running multiple
diff --git a/llvm/test/Transforms/FunctionSpecialization/noinline.ll b/llvm/test/Transforms/FunctionSpecialization/noinline.ll
index 73576402b002..34a8ecbcf7c0 100644
--- a/llvm/test/Transforms/FunctionSpecialization/noinline.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/noinline.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S --passes="ipsccp<func-spec>" < %s | FileCheck %s
+; RUN: opt -S --passes="ipsccp<func-spec>" -funcspec-for-literal-constant=false < %s | FileCheck %s
 define dso_local i32 @p0(i32 noundef %x) {
 entry:
   %add = add nsw i32 %x, 1
-- 
GitLab


From d1ee850743c29bd2064b9a308e84f048827f143e Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Fri, 18 Oct 2024 08:25:04 +0000
Subject: [PATCH 017/511] [gn build] Port 09cc75e2ccc3

---
 llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
index 84d569d34265..1b193af6c30a 100644
--- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
@@ -96,6 +96,7 @@ static_library("Basic") {
     "SourceManager.cpp",
     "SourceMgrAdapter.cpp",
     "Stack.cpp",
+    "StackExhaustionHandler.cpp",
    "TargetID.cpp",
    "TargetInfo.cpp",
    "Targets.cpp",
-- 
GitLab


From 2f15d7e43e17f72839861bfe3a5c466c325bc04d Mon Sep 17 00:00:00 2001
From: Vinayak Dev <104419489+vinayakdsci@users.noreply.github.com>
Date: Fri, 18 Oct 2024 14:02:30 +0530
Subject: [PATCH 018/511] [mlir][tensor] Fix off-by-one error in
 ReshapeOpsUtils (#112774)

This patch fixes an off-by-one error in
`mlir::getReassociationIndicesForCollapse()` that occurs when the last
two dims of the source tensor satisfy the while loop. This would cause
an assertion failure due to an out-of-bounds access, which is now fixed.
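
To make the failure mode concrete, here is a minimal standalone C++ sketch
of the loop pattern at fault (the shape and target values here are
illustrative assumptions, not taken from the original reproducer):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Trailing dims of a source shape such as ?x64x1, accumulated toward a
  // larger target dimension of 384.
  std::vector<int64_t> sourceShape = {64, 1};
  int64_t currTargetShape = 384;
  int64_t prodOfCollapsedDims = 1;
  size_t sourceDim = 0;
  // With the old bound `sourceDim < sourceShape.size()`, both trailing dims
  // pass the product test (64 * 1 < 384), so sourceDim reaches
  // sourceShape.size() and the subsequent sourceShape[sourceDim] read is
  // out of bounds. The fixed bound stops one element early:
  while (sourceDim < sourceShape.size() - 1 &&
         prodOfCollapsedDims * sourceShape[sourceDim] < currTargetShape) {
    prodOfCollapsedDims *= sourceShape[sourceDim];
    ++sourceDim;
  }
  std::printf("stopped at dim %zu, product %lld\n", sourceDim,
              (long long)prodOfCollapsedDims);
}
```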
---
 mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp |  2 +-
 mlir/test/Dialect/Tensor/canonicalize.mlir | 23 ++++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
index d2ab4cabb32b..70b2aaf9a17e 100644
--- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
@@ -47,7 +47,7 @@ mlir::getReassociationIndicesForCollapse(ArrayRef<int64_t> sourceShape,
       break;
 
     int64_t currTargetShape = targetShape[targetDim];
-    while (sourceDim < sourceShape.size() &&
+    while (sourceDim < (sourceShape.size() - 1) &&
           sourceShape[sourceDim] != ShapedType::kDynamic &&
           prodOfCollapsedDims * sourceShape[sourceDim] < currTargetShape) {
       prodOfCollapsedDims *= sourceShape[sourceDim];
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 0aa2d33ef17e..dbf0f0b81f61 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1251,6 +1251,29 @@ func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1:
 
 // -----
 
+func.func @compose_expand_of_collapse_last_two_dims(%arg0: tensor<?x64x1xf32>) -> tensor<?x384xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0, 1, 2]] : tensor<?x64x1xf32> into tensor<?xf32>
+  %c0 = arith.constant 0 : index
+  %dim = tensor.dim %collapsed, %c0 : tensor<?xf32>
+  %c384 = arith.constant 384 : index
+  %div = arith.divui %dim, %c384 : index
+  %expanded = tensor.expand_shape %collapsed [[0, 1]] output_shape [%div, 384] : tensor<?xf32> into tensor<?x384xf32>
+  return %expanded : tensor<?x384xf32>
+}
+// CHECK: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 * 64)>
+// CHECK-LABEL: @compose_expand_of_collapse_last_two_dims
+// CHECK-SAME: %[[ARG0:.+]]: tensor<?x64x1xf32>
+// CHECK: %[[CONSTANT0:.+]] = arith.constant 0 : index
+// CHECK: %[[CONSTANT384:.+]] = arith.constant 384 : index
+// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2]] : tensor<?x64x1xf32> into tensor<?xf32>
+// CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[CONSTANT0]] : tensor<?x64x1xf32>
+// CHECK: %[[AFFAPPLY:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]]
+// CHECK: %[[DIVUI:.+]] = arith.divui %[[AFFAPPLY]], %[[CONSTANT384]] : index
+// CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[COLLAPSE]] {{\[}}[0, 1]] output_shape [%[[DIVUI]], 384] : tensor<?xf32> into tensor<?x384xf32>
+// CHECK: return %[[RESULT]]
+
+// -----
+
 func.func @compose_expand_of_collapse(%arg0 : tensor<2x3x4x5x6x7x8xf32>)
     -> tensor<24x5x42x8xf32> {
   %0 = tensor.collapse_shape %arg0 [[0, 1, 2, 3, 4, 5, 6]]
-- 
GitLab


From bafc66e50f623a34eb23a14dd66bdbee944cd197 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora
Date: Fri, 18 Oct 2024 10:41:16 +0200
Subject: [PATCH 019/511] [AMDGPU][NFC] Correct description (#112847)

---
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5b74022457c2..722a79be915d 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -41,9 +41,9 @@
 /// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0  // Restore the exec mask for the Then
 ///                                    // block
 /// %exec = S_XOR_B64 %sgpr0, %exec    // Update the exec mask
-/// S_BRANCH_EXECZ label1              // Use our branch optimization
+/// S_CBRANCH_EXECZ label1             // Use our branch optimization
 ///                                    // instruction again.
-/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block +/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the ELSE block /// label1: /// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits //===----------------------------------------------------------------------===// -- GitLab From 7eaf92b3e4db5e3be9e9ee137866090d66dd08fb Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 18 Oct 2024 09:47:20 +0100 Subject: [PATCH 020/511] Sink/test: increase coverage of invariant-load (#112690) Tests adapted from: https://discourse.llvm.org/t/sinking-does-any-llvm-pass-currently-handle-load-sinking-for-invariant-loads/79643 We don't add tests for llvm.invariant.{start,end} though, as these are very difficult to support architecturally. --- llvm/test/Transforms/Sink/invariant-load.ll | 67 ++++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/Sink/invariant-load.ll b/llvm/test/Transforms/Sink/invariant-load.ll index 1aab4a969632..c8fb119acd30 100644 --- a/llvm/test/Transforms/Sink/invariant-load.ll +++ b/llvm/test/Transforms/Sink/invariant-load.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=sink -S < %s | FileCheck %s -; Loads marked invariant can be sunk across critical edges +; Loads marked invariant can be sunk across critical edges. define <4 x float> @invariant_load(ptr %in, i32 %s) { ; CHECK-LABEL: @invariant_load( @@ -12,7 +12,7 @@ define <4 x float> @invariant_load(ptr %in, i32 %s) { ; CHECK-NEXT: [[Z:%.*]] = add i32 [[S]], 1 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[V:%.*]] = load <4 x float>, ptr [[IN:%.*]], align 16, !invariant.load !0 +; CHECK-NEXT: [[V:%.*]] = load <4 x float>, ptr [[IN:%.*]], align 16, !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: ret <4 x float> [[V]] ; main_body: @@ -26,4 +26,67 @@ end: ret <4 x float> %v } +; Loads that aren't marked invariant but used in one branch +; can be sunk to that branch. + +define void @invariant_load_use_in_br(ptr %p, i1 %cond) { +; CHECK-LABEL: @invariant_load_use_in_br( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[TRUE_BR:%.*]], label [[FALSE_BR:%.*]] +; CHECK: true.br: +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: false.br: +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: call void @fn(i32 [[VAL]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %val = load i32, ptr %p + br i1 %cond, label %true.br, label %false.br +true.br: + call void @fn() + br label %exit +false.br: + call void @fn(i32 %val) + br label %exit +exit: + ret void +} + +; TODO: Invariant loads marked with metadata can be sunk past calls. 
+
+define void @invariant_load_metadata_call(ptr %p, i1 %cond) {
+; CHECK-LABEL: @invariant_load_metadata_call(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[P:%.*]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    call void @fn()
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[TRUE_BR:%.*]], label [[FALSE_BR:%.*]]
+; CHECK:       true.br:
+; CHECK-NEXT:    call void @fn()
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       false.br:
+; CHECK-NEXT:    call void @fn(i32 [[VAL]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %val = load i32, ptr %p, !invariant.load !0
+  call void @fn()
+  br i1 %cond, label %true.br, label %false.br
+true.br:
+  call void @fn()
+  br label %exit
+false.br:
+  call void @fn(i32 %val)
+  br label %exit
+exit:
+  ret void
+}
+
+declare void @fn()
+
 !0 = !{}
-- 
GitLab


From 2f792f6e7157751441b06c7212edfea1a0651a27 Mon Sep 17 00:00:00 2001
From: David Green
Date: Fri, 18 Oct 2024 09:57:25 +0100
Subject: [PATCH 021/511] [AArch64][GlobalISel] Add some post-legalization cast
 combines. (#112509)

This helps clear up some of the legalization artefacts. Not all of the
cast_combines are added (notably select combines) as they currently
have questionable benefit in the test updates.

---
 .../include/llvm/Target/GlobalISel/Combine.td | 14 ++-
 llvm/lib/Target/AArch64/AArch64Combine.td     |  6 +-
 .../AArch64/GlobalISel/combine-cast.mir       | 21 ++--
 .../AArch64/GlobalISel/combine-trunc.mir      | 98 ++++++-------------
 llvm/test/CodeGen/AArch64/add.ll              |  6 +-
 llvm/test/CodeGen/AArch64/and-mask-removal.ll |  4 +-
 llvm/test/CodeGen/AArch64/andorxor.ll         | 18 +---
 llvm/test/CodeGen/AArch64/bitcast.ll          | 20 +---
 llvm/test/CodeGen/AArch64/concat-vector.ll    | 91 ++++-------------
 llvm/test/CodeGen/AArch64/fcmp.ll             | 11 ++-
 llvm/test/CodeGen/AArch64/itofp.ll            |  5 +-
 llvm/test/CodeGen/AArch64/mul.ll              |  6 +-
 llvm/test/CodeGen/AArch64/sub.ll              |  6 +-
 13 files changed, 88 insertions(+), 218 deletions(-)

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 77cb4370b546..d0373a7dadfc 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1791,20 +1791,24 @@ class integer_of_opcode<Instruction castOpcode> : GICombineRule <
 
 def integer_of_truncate : integer_of_opcode<G_TRUNC>;
 
-def cast_combines: GICombineGroup<[
+def cast_of_cast_combines: GICombineGroup<[
   truncate_of_zext,
   truncate_of_sext,
   truncate_of_anyext,
-  select_of_zext,
-  select_of_anyext,
-  select_of_truncate,
   zext_of_zext,
   zext_of_anyext,
   sext_of_sext,
   sext_of_anyext,
   anyext_of_anyext,
   anyext_of_zext,
-  anyext_of_sext,
+  anyext_of_sext
+]>;
+
+def cast_combines: GICombineGroup<[
+  cast_of_cast_combines,
+  select_of_zext,
+  select_of_anyext,
+  select_of_truncate,
   buildvector_of_truncate,
   narrow_binop_add,
   narrow_binop_sub,
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index ead6455ddd52..321190c83b79 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -314,9 +314,9 @@ def AArch64PostLegalizerLowering
 
 // Post-legalization combines which are primarily optimizations.
def AArch64PostLegalizerCombiner : GICombiner<"AArch64PostLegalizerCombinerImpl", - [copy_prop, combines_for_extload, - combine_indexed_load_store, - sext_trunc_sextload, mutate_anyext_to_zext, + [copy_prop, cast_of_cast_combines, buildvector_of_truncate, + integer_of_truncate, mutate_anyext_to_zext, + combines_for_extload, combine_indexed_load_store, sext_trunc_sextload, hoist_logic_op_with_same_opcode_hands, redundant_and, xor_of_and_with_same_reg, extractvecelt_pairwise_add, redundant_or, diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir index ae04cc77dcaf..b045deebc56e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir @@ -135,20 +135,13 @@ name: test_combine_trunc_build_vector legalized: true body: | bb.1: - ; CHECK-PRE-LABEL: name: test_combine_trunc_build_vector - ; CHECK-PRE: %arg1:_(s64) = COPY $x0 - ; CHECK-PRE-NEXT: %arg2:_(s64) = COPY $x0 - ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64) - ; CHECK-PRE-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %arg2(s64) - ; CHECK-PRE-NEXT: %small:_(<2 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32) - ; CHECK-PRE-NEXT: $x0 = COPY %small(<2 x s32>) - ; - ; CHECK-POST-LABEL: name: test_combine_trunc_build_vector - ; CHECK-POST: %arg1:_(s64) = COPY $x0 - ; CHECK-POST-NEXT: %arg2:_(s64) = COPY $x0 - ; CHECK-POST-NEXT: %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) - ; CHECK-POST-NEXT: %small:_(<2 x s32>) = G_TRUNC %bv(<2 x s64>) - ; CHECK-POST-NEXT: $x0 = COPY %small(<2 x s32>) + ; CHECK-LABEL: name: test_combine_trunc_build_vector + ; CHECK: %arg1:_(s64) = COPY $x0 + ; CHECK-NEXT: %arg2:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %arg2(s64) + ; CHECK-NEXT: %small:_(<2 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32) + ; CHECK-NEXT: $x0 = COPY %small(<2 x s32>) %arg1:_(s64) = COPY $x0 %arg2:_(s64) = COPY $x0 %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir index 4a38b5d4c63d..9a2b9dd4b2b6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir @@ -32,20 +32,12 @@ legalized: true body: | bb.1: liveins: $h0 - ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s16 - ; CHECK-PRE: liveins: $h0 - ; CHECK-PRE-NEXT: {{ $}} - ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 - ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16) - ; CHECK-PRE-NEXT: $w0 = COPY [[ANYEXT]](s32) - ; - ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s16 - ; CHECK-POST: liveins: $h0 - ; CHECK-POST-NEXT: {{ $}} - ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 - ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s16) - ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s64) - ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s16 + ; CHECK: liveins: $h0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) %0:_(s16) = COPY $h0 %1:_(s64) = G_ANYEXT %0(s16) %2:_(s32) = G_TRUNC %1(s64) @@ -82,20 +74,12 @@ legalized: true body: | bb.1: liveins: $h0 - ; CHECK-PRE-LABEL: 
name: test_combine_trunc_sext_s32_s16 - ; CHECK-PRE: liveins: $h0 - ; CHECK-PRE-NEXT: {{ $}} - ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 - ; CHECK-PRE-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16) - ; CHECK-PRE-NEXT: $w0 = COPY [[SEXT]](s32) - ; - ; CHECK-POST-LABEL: name: test_combine_trunc_sext_s32_s16 - ; CHECK-POST: liveins: $h0 - ; CHECK-POST-NEXT: {{ $}} - ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 - ; CHECK-POST-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s16) - ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT]](s64) - ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-LABEL: name: test_combine_trunc_sext_s32_s16 + ; CHECK: liveins: $h0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16) + ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32) %0:_(s16) = COPY $h0 %1:_(s64) = G_SEXT %0(s16) %2:_(s32) = G_TRUNC %1(s64) @@ -107,20 +91,12 @@ legalized: true body: | bb.1: liveins: $h0 - ; CHECK-PRE-LABEL: name: test_combine_trunc_zext_s32_s16 - ; CHECK-PRE: liveins: $h0 - ; CHECK-PRE-NEXT: {{ $}} - ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 - ; CHECK-PRE-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16) - ; CHECK-PRE-NEXT: $w0 = COPY [[ZEXT]](s32) - ; - ; CHECK-POST-LABEL: name: test_combine_trunc_zext_s32_s16 - ; CHECK-POST: liveins: $h0 - ; CHECK-POST-NEXT: {{ $}} - ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 - ; CHECK-POST-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s16) - ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ZEXT]](s64) - ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-LABEL: name: test_combine_trunc_zext_s32_s16 + ; CHECK: liveins: $h0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16) + ; CHECK-NEXT: $w0 = COPY [[ZEXT]](s32) %0:_(s16) = COPY $h0 %1:_(s64) = G_ZEXT %0(s16) %2:_(s32) = G_TRUNC %1(s64) @@ -132,19 +108,11 @@ legalized: true body: | bb.1: liveins: $w0 - ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s32 - ; CHECK-PRE: liveins: $w0 - ; CHECK-PRE-NEXT: {{ $}} - ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK-PRE-NEXT: $w0 = COPY [[COPY]](s32) - ; - ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s32 - ; CHECK-POST: liveins: $w0 - ; CHECK-POST-NEXT: {{ $}} - ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32) - ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s64) - ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) %0:_(s32) = COPY $w0 %1:_(s64) = G_ANYEXT %0(s32) %2:_(s32) = G_TRUNC %1(s64) @@ -156,20 +124,12 @@ legalized: true body: | bb.1: liveins: $x0 - ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s64 - ; CHECK-PRE: liveins: $x0 - ; CHECK-PRE-NEXT: {{ $}} - ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-PRE-NEXT: $w0 = COPY [[TRUNC]](s32) - ; - ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s64 - ; CHECK-POST: liveins: $x0 - ; CHECK-POST-NEXT: {{ $}} - ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 - ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s128) = G_ANYEXT [[COPY]](s64) - ; CHECK-POST-NEXT: 
[[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s128) - ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s64 + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) %0:_(s64) = COPY $x0 %1:_(s128) = G_ANYEXT %0(s64) %2:_(s32) = G_TRUNC %1(s128) diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index fc1a0c71d4cd..ce7e3101a7a5 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -171,11 +171,7 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v1.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll index f005ca47ad12..09f00b3845f2 100644 --- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll +++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll @@ -530,10 +530,10 @@ define i64 @test_2_selects(i8 zeroext %a) { ; CHECK-LABEL: test_2_selects: ; CHECK: ; %bb.0: ; CHECK-NEXT: add w9, w0, #24 -; CHECK-NEXT: mov w8, #131 +; CHECK-NEXT: mov w8, #131 ; =0x83 ; CHECK-NEXT: and w9, w9, #0xff ; CHECK-NEXT: cmp w9, #81 -; CHECK-NEXT: mov w9, #57 +; CHECK-NEXT: mov w9, #57 ; =0x39 ; CHECK-NEXT: csel x8, x8, xzr, lo ; CHECK-NEXT: csel x9, xzr, x9, eq ; CHECK-NEXT: add x0, x8, x9 diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 5385a917619f..459daece90de 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -463,11 +463,7 @@ define void @and_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v1.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -514,11 +510,7 @@ define void @or_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v1.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret @@ -565,11 +557,7 @@ define void @xor_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v1.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret diff --git 
a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 79cfeedb74bc..bbdf8b0a13d3 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -60,11 +60,7 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-LABEL: bitcast_v4i8_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v1.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = add <4 x i8> %a, %b @@ -116,9 +112,7 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI-LABEL: bitcast_v2i16_i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-GI-NEXT: xtn v0.4h, v1.4s +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret %c = add <2 x i16> %a, %b @@ -418,9 +412,7 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI-LABEL: bitcast_v2i16_v4i8: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-GI-NEXT: xtn v0.4h, v1.4s +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: mov b1, v0.b[1] ; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] ; CHECK-GI-NEXT: mov b3, v0.b[2] @@ -455,11 +447,7 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-LABEL: bitcast_v4i8_v2i16: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v1.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index d800b2549cf2..0033999b9bd5 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -33,18 +33,8 @@ define <8 x i8> @concat2(<4 x i8> %A, <4 x i8> %B) { ; ; CHECK-GI-LABEL: concat2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v3.h[0], v1.h[0] -; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v3.h[1], v1.h[1] -; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v3.h[2], v1.h[2] -; CHECK-GI-NEXT: mov v2.h[3], v0.h[3] -; CHECK-GI-NEXT: mov v3.h[3], v1.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v2.8h -; CHECK-GI-NEXT: xtn v1.8b, v3.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 @@ -74,15 +64,9 @@ define <4 x i16> @concat4(<2 x i16> %A, <2 x i16> %B) { ; ; CHECK-GI-LABEL: concat4: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] -; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] -; CHECK-GI-NEXT: xtn v2.4h, v2.4s -; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] -; CHECK-GI-NEXT: xtn 
v1.4h, v0.4s -; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 @@ -183,12 +167,11 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) { ; ; CHECK-GI-LABEL: concat_v8s16_v2s16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x0, #2] -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: ldrh w8, [x0] +; CHECK-GI-NEXT: ldrh w9, [x0, #2] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] ; CHECK-GI-NEXT: ret %a = load <2 x i16>, ptr %ptr %b = shufflevector <2 x i16> %a, <2 x i16> %a, <8 x i32> @@ -238,34 +221,14 @@ define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, < ; ; CHECK-GI-LABEL: concat_v16s8_v4s8_reg: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov v4.h[0], v0.h[0] -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v5.h[0], v1.h[0] -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: mov v6.h[0], v2.h[0] -; CHECK-GI-NEXT: mov v7.h[0], v3.h[0] -; CHECK-GI-NEXT: mov v4.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v5.h[1], v1.h[1] -; CHECK-GI-NEXT: mov v6.h[1], v2.h[1] -; CHECK-GI-NEXT: mov v7.h[1], v3.h[1] -; CHECK-GI-NEXT: mov v4.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v5.h[2], v1.h[2] -; CHECK-GI-NEXT: mov v6.h[2], v2.h[2] -; CHECK-GI-NEXT: mov v7.h[2], v3.h[2] -; CHECK-GI-NEXT: mov v4.h[3], v0.h[3] -; CHECK-GI-NEXT: mov v5.h[3], v1.h[3] -; CHECK-GI-NEXT: mov v6.h[3], v2.h[3] -; CHECK-GI-NEXT: mov v7.h[3], v3.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v4.8h -; CHECK-GI-NEXT: xtn v1.8b, v5.8h -; CHECK-GI-NEXT: xtn v2.8b, v6.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: xtn v1.8b, v7.8h +; CHECK-GI-NEXT: uzp1 v2.8b, v2.8b, v0.8b ; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: uzp1 v1.8b, v3.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: fmov w8, s1 @@ -291,29 +254,17 @@ define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> % ; ; CHECK-GI-LABEL: concat_v8s16_v2s16_reg: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov v4.s[0], v0.s[0] -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v5.s[0], v1.s[0] -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: mov v4.s[1], v0.s[1] -; CHECK-GI-NEXT: mov v5.s[1], v1.s[1] -; CHECK-GI-NEXT: mov v1.s[0], v2.s[0] -; CHECK-GI-NEXT: xtn v0.4h, v4.4s -; CHECK-GI-NEXT: xtn v4.4h, v5.4s -; CHECK-GI-NEXT: mov v1.s[1], v2.s[1] -; CHECK-GI-NEXT: mov v2.s[0], v3.s[0] +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h ; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: mov v2.s[1], v3.s[1] ; CHECK-GI-NEXT: mov v0.s[0], w8 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: xtn v2.4h, v2.4s -; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: uzp1 v2.4h, v2.4h, 
v0.4h +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: uzp1 v1.4h, v3.4h, v0.4h ; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[3], w8 ; CHECK-GI-NEXT: ret %b = shufflevector <2 x i16> %A, <2 x i16> %B, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index baab53d8bdbd..66f26fc9d859 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -922,26 +922,27 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x ; CHECK-GI-LABEL: v3f64_i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: mov w8, #31 // =0x1f ; CHECK-GI-NEXT: fcmp d2, d5 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] ; CHECK-GI-NEXT: mov v1.s[0], w8 ; CHECK-GI-NEXT: cset w9, mi -; CHECK-GI-NEXT: mov v2.d[0], x9 +; CHECK-GI-NEXT: mov v2.s[0], w9 ; CHECK-GI-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-NEXT: fcmgt v0.2d, v3.2d, v0.2d ; CHECK-GI-NEXT: mov v1.s[1], w8 ; CHECK-GI-NEXT: mov v3.s[0], w9 +; CHECK-GI-NEXT: xtn v0.2s, v0.2d ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: mov v3.s[1], w9 +; CHECK-GI-NEXT: mov v0.d[1], v2.d[0] +; CHECK-GI-NEXT: mov v3.s[2], w9 ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: neg v1.4s, v1.4s -; CHECK-GI-NEXT: mov v3.s[2], w9 ; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v3.16b ; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index c5bde81ba4a5..81c1a64f2d43 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -7937,10 +7937,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; ; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov v1.s[0], v0.s[0] -; CHECK-GI-FP16-NEXT: mov v1.s[1], v0.s[1] -; CHECK-GI-FP16-NEXT: xtn v0.4h, v1.4s +; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-GI-FP16-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 9e748c9641aa..5e7f71c18c27 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -183,11 +183,7 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v1.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 8e7586bd4843..c298e6d8a1ff 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -171,11 +171,7 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: ushll v0.8h, 
v3.8b, #0 ; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 ; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] -; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] -; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] -; CHECK-GI-NEXT: xtn v0.8b, v1.8h +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: str w8, [x0] ; CHECK-GI-NEXT: ret -- GitLab From c72992bf897b22465b2c80343b1b4a5afd1508ef Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 17:48:52 +0100 Subject: [PATCH 022/511] [DAG] visitABS - use FoldConstantArithmetic to attempt to constant fold Don't rely on isConstantFPBuildVectorOrConstantFP followed by getNode() will constant fold - FoldConstantArithmetic will do all of this for us. Cleanup for #112682 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 18439b87a83b..2f829be97b07 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18433,10 +18433,11 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { SDValue DAGCombiner::visitFABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (fabs c1) -> fabs(c1) - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0})) + return C; // fold (fabs (fabs x)) -> (fabs x) if (N0.getOpcode() == ISD::FABS) @@ -18445,7 +18446,7 @@ SDValue DAGCombiner::visitFABS(SDNode *N) { // fold (fabs (fneg x)) -> (fabs x) // fold (fabs (fcopysign x, y)) -> (fabs x) if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) - return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0)); if (SDValue Cast = foldSignChangeInBitcast(N)) return Cast; -- GitLab From 7a43be1690e27ddf8813e49d93eb419d214fcd7a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 18:15:54 +0100 Subject: [PATCH 023/511] [DAG] visitXROUND - use FoldConstantArithmetic to attempt to constant fold Don't rely on isConstantFPBuildVectorOrConstantFP followed by getNode() will constant fold - FoldConstantArithmetic will do all of this for us. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2f829be97b07..b717847fc302 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18156,8 +18156,9 @@ SDValue DAGCombiner::visitXROUND(SDNode *N) { // fold (lrint|llrint c1fp) -> c1 // fold (lround|llround c1fp) -> c1 - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0); + if (SDValue C = + DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, {N0})) + return C; return SDValue(); } -- GitLab From 3a1df05ca91fa6a0f893123ad08a46a443b0b486 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Oct 2024 09:47:17 +0100 Subject: [PATCH 024/511] [DAG] visitFP_ROUND - use FoldConstantArithmetic to attempt to constant fold Don't rely on isConstantFPBuildVectorOrConstantFP followed by getNode() will constant fold - FoldConstantArithmetic will do all of this for us. 
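
In other words, the old pattern depended on a separate constant check and on
getNode() then performing the fold; FoldConstantArithmetic does both.
Schematically (a simplified sketch, not the verbatim DAGCombiner source):

```cpp
// Before: check for constant operands, then rely on getNode() to fold.
if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
  return DAG.getNode(ISD::FP_ROUND, DL, VT, N0, N1);

// After: FoldConstantArithmetic both checks that the operands are constant
// and performs the fold itself, returning a null SDValue otherwise.
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FP_ROUND, DL, VT, {N0, N1}))
  return C;
```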
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b717847fc302..c3bcd3bb9367 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18167,10 +18167,10 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (fp_round c1fp) -> c1fp - if (SDValue C = - DAG.FoldConstantArithmetic(ISD::FP_ROUND, SDLoc(N), VT, {N0, N1})) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FP_ROUND, DL, VT, {N0, N1})) return C; // fold (fp_round (fp_extend x)) -> x @@ -18201,12 +18201,10 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { // single-step fp_round we want to fold to. // In other words, double rounding isn't the same as rounding. // Also, this is a value preserving truncation iff both fp_round's are. - if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) { - SDLoc DL(N); + if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) return DAG.getNode( ISD::FP_ROUND, DL, VT, N0.getOperand(0), DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL, /*isTarget=*/true)); - } } // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) @@ -18220,8 +18218,7 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(Tmp.getNode()); - return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, - Tmp, N0.getOperand(1)); + return DAG.getNode(ISD::FCOPYSIGN, DL, VT, Tmp, N0.getOperand(1)); } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) -- GitLab From 3ec1b1a4dd52641c4b84cac55ef3a228960a0bdc Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Oct 2024 10:10:33 +0100 Subject: [PATCH 025/511] [DAG] visitFP_EXTEND - use FoldConstantArithmetic to attempt to constant fold Don't rely on isConstantFPBuildVectorOrConstantFP followed by getNode() will constant fold - FoldConstantArithmetic will do all of this for us. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c3bcd3bb9367..9946cf94a7c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18230,42 +18230,40 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); if (VT.isVector()) - if (SDValue FoldedVOp = SimplifyVCastOp(N, SDLoc(N))) + if (SDValue FoldedVOp = SimplifyVCastOp(N, DL)) return FoldedVOp; // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. 
-  if (N->hasOneUse() &&
-      N->use_begin()->getOpcode() == ISD::FP_ROUND)
+  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
     return SDValue();
 
   // fold (fp_extend c1fp) -> c1fp
-  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
-    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FP_EXTEND, DL, VT, {N0}))
+    return C;
 
   // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
   if (N0.getOpcode() == ISD::FP16_TO_FP &&
       TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
-    return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));
+    return DAG.getNode(ISD::FP16_TO_FP, DL, VT, N0.getOperand(0));
 
   // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
   // value of X.
-  if (N0.getOpcode() == ISD::FP_ROUND
-      && N0.getConstantOperandVal(1) == 1) {
+  if (N0.getOpcode() == ISD::FP_ROUND && N0.getConstantOperandVal(1) == 1) {
     SDValue In = N0.getOperand(0);
     if (In.getValueType() == VT) return In;
     if (VT.bitsLT(In.getValueType()))
-      return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
-                         In, N0.getOperand(1));
-    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
+      return DAG.getNode(ISD::FP_ROUND, DL, VT, In, N0.getOperand(1));
+    return DAG.getNode(ISD::FP_EXTEND, DL, VT, In);
   }
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
+    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT,
                                      LN0->getChain(),
                                      LN0->getBasePtr(), N0.getValueType(),
                                      LN0->getMemOperand());
-- 
GitLab


From 5f7502bf1f193482e23385cdd4cfecf09f19ccbc Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Fri, 18 Oct 2024 10:19:22 +0100
Subject: [PATCH 026/511] [AArch64][SVE] Support lowering fixed-length
 BUILD_VECTORS to ZIPs (#111698)

This allows lowering fixed-length (non-constant) BUILD_VECTORS (<=
128-bit) to a chain of ZIP1 instructions when Neon is not available,
rather than using the default lowering, which is to spill to the stack
and reload.

For example,

```
t5: v4f32 = BUILD_VECTOR(t0, t1, t2, t3)
```

Becomes:

```
zip1 z0.s, z0.s, z1.s // z0 = t0,t1,...
zip1 z2.s, z2.s, z3.s // z2 = t2,t3,...
zip1 z0.d, z0.d, z2.d // z0 = t0,t1,t2,t3,...
```

When values are already in FPRs, this generally seems to lead to a more
compact output with less movement to/from the stack.
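
The interleave chain can be modeled in ordinary C++ (an illustrative,
standalone model of the strategy, not the compiler code): each scalar starts
in lane 0 of its own register, and every round interleaves pairs at twice the
previous element width, so the count of populated lanes doubles until one
register holds the whole vector. This is why a full vector needs only
log2(n) ZIP1s.

```cpp
#include <array>
#include <cstdio>

int main() {
  std::array<std::array<float, 4>, 4> regs{};
  for (int i = 0; i < 4; ++i)
    regs[i][0] = float(i); // t0..t3, each in lane 0 of its own "register"
  for (int width = 1, live = 4; live > 1; width *= 2, live /= 2) {
    for (int i = 0; i < live; i += 2) {
      std::array<float, 4> zip{};
      // ZIP1 at this element width: low chunk of op0, then low chunk of op1
      // (only those chunks carry defined lanes at this point).
      for (int j = 0; j < width; ++j) {
        zip[j] = regs[i][j];
        zip[width + j] = regs[i + 1][j];
      }
      regs[i / 2] = zip;
    }
  }
  for (float lane : regs[0])
    std::printf("%g ", lane); // prints: 0 1 2 3
  std::printf("\n");
}
```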
--- .../Target/AArch64/AArch64ISelLowering.cpp | 75 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 + .../sve-fixed-length-vector-shuffle-tbl.ll | 262 ++-- ...streaming-mode-fixed-length-and-combine.ll | 8 +- ...sve-streaming-mode-fixed-length-bitcast.ll | 9 +- ...treaming-mode-fixed-length-build-vector.ll | 252 ++++ .../sve-streaming-mode-fixed-length-concat.ll | 70 +- ...e-streaming-mode-fixed-length-ext-loads.ll | 44 +- ...ing-mode-fixed-length-extract-subvector.ll | 32 +- ...e-streaming-mode-fixed-length-fcopysign.ll | 52 +- ...e-streaming-mode-fixed-length-fp-to-int.ll | 692 +++++----- ...-streaming-mode-fixed-length-fp-vselect.ll | 11 +- ...ing-mode-fixed-length-insert-vector-elt.ll | 8 +- ...e-streaming-mode-fixed-length-int-to-fp.ll | 24 +- ...-streaming-mode-fixed-length-ld2-alloca.ll | 53 +- ...streaming-mode-fixed-length-masked-load.ll | 82 +- ...treaming-mode-fixed-length-masked-store.ll | 58 +- ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 1132 +++++++---------- ...e-streaming-mode-fixed-length-reshuffle.ll | 16 +- 19 files changed, 1336 insertions(+), 1545 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a44a73eb2c0f..d5466e0a1cbd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -2111,7 +2112,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default); setOperationAction(ISD::BITREVERSE, VT, Default); setOperationAction(ISD::BSWAP, VT, Default); - setOperationAction(ISD::BUILD_VECTOR, VT, Default); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Default); setOperationAction(ISD::CTLZ, VT, Default); setOperationAction(ISD::CTPOP, VT, Default); @@ -14395,24 +14396,72 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, return SDValue(); } -SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, - SelectionDAG &DAG) const { +SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE( + SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + SDLoc DL(Op); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + auto *BVN = cast(Op); - if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) { - if (auto SeqInfo = cast(Op)->isConstantSequence()) { - SDLoc DL(Op); - EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); - SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT); - SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second); - SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps); - return convertFromScalableVector(DAG, Op.getValueType(), Seq); - } + if (auto SeqInfo = BVN->isConstantSequence()) { + SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT); + SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second); + SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps); + return convertFromScalableVector(DAG, VT, Seq); + } + + unsigned NumElems = VT.getVectorNumElements(); + if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 || + NumElems <= 1 || BVN->isConstant()) + return SDValue(); + + auto IsExtractElt = 
+    return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
+  };
 
-    // Revert to common legalisation for all other variants.
+  // For integer types that are not already in vectors limit to at most four
+  // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
+  if (VT.getScalarType().isInteger() &&
+      NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
     return SDValue();
+
+  // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
+  SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
+  SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
+      Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
+        return Op.isUndef() ? Undef
+                            : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                                          ContainerVT, Undef, Op, ZeroI64);
+      });
+
+  ElementCount ZipEC = ContainerVT.getVectorElementCount();
+  while (Intermediates.size() > 1) {
+    EVT ZipVT = getPackedSVEVectorVT(ZipEC);
+
+    for (unsigned I = 0; I < Intermediates.size(); I += 2) {
+      SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
+      SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
+      Intermediates[I / 2] =
+          Op1.isUndef() ? Op0
+                        : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
+    }
+
+    Intermediates.resize(Intermediates.size() / 2);
+    ZipEC = ZipEC.divideCoefficientBy(2);
   }
 
+  assert(Intermediates.size() == 1);
+  SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
+  return convertFromScalableVector(DAG, VT, Vec);
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+    return LowerFixedLengthBuildVectorToSVE(Op, DAG);
+
   // Try to build a simple constant vector.
   Op = NormalizeBuildVector(Op, DAG);
   // Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 217e971568a9..160cd18ca53b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1244,6 +1244,7 @@ private:
   SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op,
                                               SelectionDAG &DAG) const;
+  SDValue LowerFixedLengthBuildVectorToSVE(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 276f23703df3..20659cde83ee 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -140,98 +140,65 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
 ;
 ; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_128_NOMAX:       // %bb.0:
-; SVE2_128_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_128_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z0.b[7]
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
 ; SVE2_128_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
 ; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #15]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s3
 ; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #14]
-; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s2
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #13]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #12]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s3
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #11]
-; SVE2_128_NOMAX-NEXT:    fmov w9, s0
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #10]
-; SVE2_128_NOMAX-NEXT:    fmov w8, s1
-; SVE2_128_NOMAX-NEXT:    strb w9, [sp, #9]
-; SVE2_128_NOMAX-NEXT:    strb w8, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    ldr d0, [sp, #8]
-; SVE2_128_NOMAX-NEXT:    add sp, sp, #16
+; SVE2_128_NOMAX-NEXT:    mov z4.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[7]
+; SVE2_128_NOMAX-NEXT:    mov z6.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
+; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
+; SVE2_128_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
+; SVE2_128_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_128_NOMAX-NEXT:    ret
 ;
 ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_NOMIN_NOMAX:       // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT:    sub sp, sp, #16
-; SVE2_NOMIN_NOMAX-NEXT:    .cfi_def_cfa_offset 16
 ; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z0.b[7]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w8, s1
 ; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov w9, s2
 ; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    strb w8, [sp, #15]
-; SVE2_NOMIN_NOMAX-NEXT:    fmov
w8, s3 ; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2] -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #14] -; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #13] -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #12] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3 -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #11] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s0 -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #10] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #9] -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #8] -; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16 +; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[7] +; SVE2_NOMIN_NOMAX-NEXT: mov z6.b, z0.b[6] +; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[4] +; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_NOMIN_NOMAX-NEXT: ret ; ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops: ; SVE2_MIN_256_NOMAX: // %bb.0: -; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1] -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[7] -; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[6] -; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[4] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 ; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 ; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3] -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1] -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #15] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3 ; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2] -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #14] -; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #13] -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #12] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3 -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #11] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s0 -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #10] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #9] -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #8] -; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16 +; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[7] +; SVE2_MIN_256_NOMAX-NEXT: mov z6.b, z0.b[6] +; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[4] +; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_MIN_256_NOMAX-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b @@ -263,89 +230,59 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) { ; ; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value: ; SVE2_128_NOMAX: // %bb.0: -; 
SVE2_128_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_128_NOMAX-NEXT: ldr d0, [x1] -; SVE2_128_NOMAX-NEXT: ldr d3, [x0] -; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[6] -; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[4] -; SVE2_128_NOMAX-NEXT: fmov w8, s1 -; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[3] -; SVE2_128_NOMAX-NEXT: fmov w9, s2 -; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[2] -; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #14] -; SVE2_128_NOMAX-NEXT: fmov w8, s1 -; SVE2_128_NOMAX-NEXT: mov z1.b, z3.b[1] -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #13] -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #12] -; SVE2_128_NOMAX-NEXT: fmov w9, s2 -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #11] -; SVE2_128_NOMAX-NEXT: fmov w8, s0 -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #10] -; SVE2_128_NOMAX-NEXT: fmov w9, s1 -; SVE2_128_NOMAX-NEXT: strb w8, [sp, #9] -; SVE2_128_NOMAX-NEXT: strb w9, [sp, #8] -; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_128_NOMAX-NEXT: add sp, sp, #16 +; SVE2_128_NOMAX-NEXT: ldr d1, [x0] +; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3] +; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2] +; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[4] +; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[6] +; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_128_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b +; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_128_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h +; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_128_NOMAX-NEXT: ret ; ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value: ; SVE2_NOMIN_NOMAX: // %bb.0: -; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1] -; SVE2_NOMIN_NOMAX-NEXT: ldr d3, [x0] -; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[6] -; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[4] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 -; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[3] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 -; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[2] -; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #14] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 -; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z3.b[1] -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #13] -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #12] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #11] -; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s0 -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #10] -; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s1 -; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #9] -; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #8] -; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16 +; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0] +; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3] +; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2] +; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[4] +; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[6] +; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b +; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h +; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_NOMIN_NOMAX-NEXT: ret ; ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value: ; 
SVE2_MIN_256_NOMAX: // %bb.0: -; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16 -; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16 ; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1] -; SVE2_MIN_256_NOMAX-NEXT: ldr d3, [x0] -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[6] -; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[4] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[3] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 -; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[2] -; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1] -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #14] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 -; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z3.b[1] -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #13] -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #12] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #11] -; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s0 -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #10] -; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s1 -; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #9] -; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #8] -; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8] -; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16 +; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0] +; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3] +; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2] +; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1] +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[4] +; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[6] +; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b +; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h +; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s +; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE2_MIN_256_NOMAX-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b @@ -401,34 +338,23 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) { define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) { ; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: mov z1.b, z0.b[7] -; CHECK-NEXT: mov z2.b, z0.b[6] -; CHECK-NEXT: mov z3.b, z0.b[4] -; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: fmov w9, s2 ; CHECK-NEXT: mov z2.b, z0.b[3] -; CHECK-NEXT: mov z1.b, z1.b[1] -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: mov z3.b, z0.b[2] -; CHECK-NEXT: strb w9, [sp, #14] -; CHECK-NEXT: mov z0.b, z0.b[1] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: strb w8, [sp, #12] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strb w9, [sp, #11] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w9, [sp, #9] -; CHECK-NEXT: strb w8, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z4.b, z0.b[1] +; CHECK-NEXT: mov z1.b, z1.b[1] +; CHECK-NEXT: mov z5.b, z0.b[7] +; CHECK-NEXT: mov z6.b, z0.b[6] +; CHECK-NEXT: mov z0.b, z0.b[4] +; CHECK-NEXT: zip1 z2.b, z3.b, z2.b +; CHECK-NEXT: zip1 z1.b, z1.b, z4.b +; CHECK-NEXT: zip1 z3.b, z6.b, z5.b +; CHECK-NEXT: zip1 z0.b, z0.b, z0.b +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; 
CHECK-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll index 617b560713c3..478072d33d8c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll @@ -184,13 +184,11 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind { define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind { ; CHECK-LABEL: vls_sve_and_2xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: fmov s1, wzr ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: stp wzr, w8, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: vls_sve_and_2xi16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll index b9264ad5f77c..6644be11a02b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll @@ -91,19 +91,12 @@ define void @bitcast_v32i8(ptr %a, ptr %b) { define void @bitcast_v2i16(ptr %a, ptr %b) { ; CHECK-LABEL: bitcast_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: str w8, [x1] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: bitcast_v2i16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll index b8a2e0e0f4bd..9729a1d95cd9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll @@ -222,3 +222,255 @@ define void @build_vector_no_stride_v4f64(ptr %a) { store <4 x double> , ptr %a, align 8 ret void } + +define void @build_vector_non_const_v4i1(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: orr w8, w0, w1, lsl #1 +; CHECK-NEXT: orr w8, w8, w2, lsl #2 +; CHECK-NEXT: orr w8, w8, w3, lsl #3 +; CHECK-NEXT: strb w8, [x4] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v4i1: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: orr w8, w0, w1, lsl #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w2, lsl #2 +; NONEON-NOSVE-NEXT: orr w8, w8, w3, lsl #3 +; NONEON-NOSVE-NEXT: strb w8, [x4] +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <4 x i1> undef, i1 %a, i64 0 + %2 = insertelement <4 x i1> %1, i1 %b, i64 1 + %3 = insertelement <4 x i1> %2, i1 %c, i64 2 + %4 = insertelement <4 x i1> %3, i1 %d, i64 3 + store <4 x i1> %4, ptr %out + ret void +} + +define void @build_vector_non_const_v2f64(double %a, double %b, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 
def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v2f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <2 x double> undef, double %a, i64 0 + %2 = insertelement <2 x double> %1, double %b, i64 1 + store <2 x double> %2, ptr %out + ret void +} + +define void @build_vector_non_const_v2f32(float %a, float %b, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1 +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v2f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <2 x float> undef, float %a, i64 0 + %2 = insertelement <2 x float> %1, float %b, i64 1 + store <2 x float> %2, ptr %out + ret void +} + +define void @build_vector_non_const_v4f32(float %a, float %b, float %c, float %d, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2 +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: // kill: def $s3 killed $s3 def $z3 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $z1 +; CHECK-NEXT: zip1 z2.s, z2.s, z3.s +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: zip1 z0.d, z0.d, z2.d +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v4f32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s2, s3, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <4 x float> undef, float %a, i64 0 + %2 = insertelement <4 x float> %1, float %b, i64 1 + %3 = insertelement <4 x float> %2, float %c, i64 2 + %4 = insertelement <4 x float> %3, float %d, i64 3 + store <4 x float> %4, ptr %out + ret void +} + +define void @build_vector_non_const_v4f64(double %a, double %b, double %c, double %d, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d3 killed $d3 def $z3 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: zip1 z2.d, z2.d, z3.d +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v4f64: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d2, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <4 x double> undef, double %a, i64 0 + %2 = insertelement <4 x double> %1, double %b, i64 1 + %3 = insertelement <4 x double> %2, double %c, i64 2 + %4 = insertelement <4 x double> %3, double %d, i64 3 + store <4 x double> %4, ptr %out + ret void +} + +define void @build_vector_non_const_v8f16(half %a, half %b, half %c, half %d, half %e, half %f, half %g, half %h, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h6 killed $h6 def $z6 +; CHECK-NEXT: // kill: def $h4 killed $h4 def $z4 +; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: // kill: def $h7 killed $h7 def $z7 +; CHECK-NEXT: // kill: def $h5 killed $h5 def $z5 +; CHECK-NEXT: // kill: def $h3 killed $h3 def $z3 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $z1 +; CHECK-NEXT: zip1 z6.h, z6.h, z7.h +; CHECK-NEXT: zip1 z4.h, z4.h, z5.h +; CHECK-NEXT: zip1 z2.h, z2.h, z3.h +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: zip1 z1.s, z4.s, z6.s +; CHECK-NEXT: zip1 z0.s, z0.s, z2.s +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v8f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h7, [sp, #14] +; NONEON-NOSVE-NEXT: str h6, [sp, #12] +; NONEON-NOSVE-NEXT: str h5, [sp, #10] +; NONEON-NOSVE-NEXT: str h4, [sp, #8] +; NONEON-NOSVE-NEXT: str h3, [sp, #6] +; NONEON-NOSVE-NEXT: str h2, [sp, #4] +; NONEON-NOSVE-NEXT: str h1, [sp, #2] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <8 x half> undef, half %a, i64 0 + %2 = insertelement <8 x half> %1, half %b, i64 1 + %3 = insertelement <8 x half> %2, half %c, i64 2 + %4 = insertelement <8 x half> %3, half %d, i64 3 + %5 = insertelement <8 x half> %4, half %e, i64 4 + %6 = insertelement <8 x half> %5, half %f, i64 5 + %7 = insertelement <8 x half> %6, half %g, i64 6 + %8 = insertelement <8 x half> %7, half %h, i64 7 + store <8 x half> %8, ptr %out + ret void +} + +define void @build_vector_non_const_v2i32(i32 %a, i32 %b, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: str d0, [x2] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v2i32: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <2 x i32> undef, i32 %a, i64 0 + %2 = insertelement <2 x i32> %1, i32 %b, i64 1 + store <2 x i32> %2, ptr %out + ret void +} + +define void @build_vector_non_const_v8i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h, ptr %out) { +; CHECK-LABEL: build_vector_non_const_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: strb w7, [sp, #15] +; CHECK-NEXT: ldr x8, 
[sp, #16] +; CHECK-NEXT: strb w6, [sp, #14] +; CHECK-NEXT: strb w5, [sp, #13] +; CHECK-NEXT: strb w4, [sp, #12] +; CHECK-NEXT: strb w3, [sp, #11] +; CHECK-NEXT: strb w2, [sp, #10] +; CHECK-NEXT: strb w1, [sp, #9] +; CHECK-NEXT: strb w0, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: build_vector_non_const_v8i8: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strb w7, [sp, #15] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w6, [sp, #14] +; NONEON-NOSVE-NEXT: strb w5, [sp, #13] +; NONEON-NOSVE-NEXT: strb w4, [sp, #12] +; NONEON-NOSVE-NEXT: strb w3, [sp, #11] +; NONEON-NOSVE-NEXT: strb w2, [sp, #10] +; NONEON-NOSVE-NEXT: strb w1, [sp, #9] +; NONEON-NOSVE-NEXT: strb w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [x8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ret + %1 = insertelement <8 x i8> undef, i8 %a, i64 0 + %2 = insertelement <8 x i8> %1, i8 %b, i64 1 + %3 = insertelement <8 x i8> %2, i8 %c, i64 2 + %4 = insertelement <8 x i8> %3, i8 %d, i64 3 + %5 = insertelement <8 x i8> %4, i8 %e, i64 4 + %6 = insertelement <8 x i8> %5, i8 %f, i64 5 + %7 = insertelement <8 x i8> %6, i8 %g, i64 6 + %8 = insertelement <8 x i8> %7, i8 %h, i64 7 + store <8 x i8> %8, ptr %out + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll index 4b6285b2732f..c1810c678ea5 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -12,34 +12,22 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: concat_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: mov z2.h, z1.h[3] -; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z2.h, z1.h[3] ; CHECK-NEXT: mov z3.h, z1.h[2] -; CHECK-NEXT: mov z1.h, z1.h[1] -; CHECK-NEXT: mov z4.h, z0.h[3] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[2] -; CHECK-NEXT: mov z0.h, z0.h[1] -; CHECK-NEXT: strb w9, [sp, #8] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w9, [sp, #14] -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strb w8, [sp, #11] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [sp, #9] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z4.h, z1.h[1] +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z6.h, z0.h[2] +; CHECK-NEXT: mov z7.h, z0.h[1] +; CHECK-NEXT: zip1 z2.b, z3.b, z2.b +; CHECK-NEXT: zip1 z1.b, z1.b, z4.b +; CHECK-NEXT: zip1 z3.b, z6.b, z5.b +; CHECK-NEXT: zip1 z0.b, z0.b, z7.b +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: concat_v8i8: @@ -152,22 +140,14 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) { define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) { ; 
CHECK-LABEL: concat_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z2.s, z1.s[1] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z3.s, z0.s[1] +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: concat_v4i16: @@ -428,18 +408,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) { define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) { ; CHECK-LABEL: concat_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: str h1, [sp, #12] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: str h2, [sp, #14] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: concat_v4f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 50a05cb4b1e2..7d6336a43a4f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -326,29 +326,29 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) { ; CHECK-LABEL: load_sext_v2i64i256: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: mov z1.d, z0.d[1] -; CHECK-NEXT: asr x9, x8, #63 -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: stp x8, x9, [sp, #-32]! 
-; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: asr x8, x10, #63 -; CHECK-NEXT: mov z0.d, x9 -; CHECK-NEXT: stp x10, x8, [sp, #16] -; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ldp q2, q4, [sp], #32 -; CHECK-NEXT: mov z3.d, z0.d[1] -; CHECK-NEXT: mov z5.d, z1.d[1] -; CHECK-NEXT: mov z6.d, z2.d[1] -; CHECK-NEXT: fmov x2, d0 -; CHECK-NEXT: mov z0.d, z4.d[1] -; CHECK-NEXT: fmov x6, d1 -; CHECK-NEXT: fmov x0, d2 -; CHECK-NEXT: fmov x4, d4 -; CHECK-NEXT: fmov x3, d3 -; CHECK-NEXT: fmov x7, d5 -; CHECK-NEXT: fmov x1, d6 -; CHECK-NEXT: fmov x5, d0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: asr x8, x8, #63 +; CHECK-NEXT: fmov d3, x8 +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: asr x9, x9, #63 +; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: zip1 z0.d, z0.d, z3.d +; CHECK-NEXT: mov z3.d, x9 +; CHECK-NEXT: fmov x2, d2 +; CHECK-NEXT: zip1 z1.d, z1.d, z4.d +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: mov z5.d, z0.d[1] +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: fmov x6, d3 +; CHECK-NEXT: mov z2.d, z1.d[1] +; CHECK-NEXT: fmov x3, d4 +; CHECK-NEXT: fmov x1, d5 +; CHECK-NEXT: fmov x4, d1 +; CHECK-NEXT: fmov x7, d6 +; CHECK-NEXT: fmov x5, d2 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: load_sext_v2i64i256: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll index 266569630846..a728cbe97056 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll @@ -10,23 +10,15 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) { ; CHECK-LABEL: extract_subvector_v8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.b, z0.b[7] ; CHECK-NEXT: mov z2.b, z0.b[6] ; CHECK-NEXT: mov z3.b, z0.b[5] ; CHECK-NEXT: mov z0.b, z0.b[4] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: extract_subvector_v8i1: @@ -53,23 +45,15 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) { define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) { ; CHECK-LABEL: extract_subvector_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.b, z0.b[7] ; CHECK-NEXT: mov z2.b, z0.b[6] ; CHECK-NEXT: mov z3.b, z0.b[5] ; CHECK-NEXT: mov z0.b, z0.b[4] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: 
extract_subvector_v8i8: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index dad53b31db0b..f1771a753826 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -1126,49 +1126,39 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE-LABEL: test_copysign_v4f16_v4f64: ; SVE: // %bb.0: -; SVE-NEXT: sub sp, sp, #16 -; SVE-NEXT: .cfi_def_cfa_offset 16 -; SVE-NEXT: ldp q1, q0, [x1] -; SVE-NEXT: ldr d4, [x0] -; SVE-NEXT: and z4.h, z4.h, #0x7fff -; SVE-NEXT: mov z2.d, z0.d[1] -; SVE-NEXT: mov z3.d, z1.d[1] -; SVE-NEXT: fcvt h0, d0 +; SVE-NEXT: ldp q0, q1, [x1] +; SVE-NEXT: mov z2.d, z1.d[1] +; SVE-NEXT: mov z3.d, z0.d[1] ; SVE-NEXT: fcvt h1, d1 +; SVE-NEXT: fcvt h0, d0 ; SVE-NEXT: fcvt h2, d2 ; SVE-NEXT: fcvt h3, d3 -; SVE-NEXT: str h0, [sp, #12] -; SVE-NEXT: str h1, [sp, #8] -; SVE-NEXT: str h2, [sp, #14] -; SVE-NEXT: str h3, [sp, #10] -; SVE-NEXT: ldr d0, [sp, #8] +; SVE-NEXT: zip1 z1.h, z1.h, z2.h +; SVE-NEXT: zip1 z0.h, z0.h, z3.h +; SVE-NEXT: zip1 z0.s, z0.s, z1.s +; SVE-NEXT: ldr d1, [x0] +; SVE-NEXT: and z1.h, z1.h, #0x7fff ; SVE-NEXT: and z0.h, z0.h, #0x8000 -; SVE-NEXT: orr z0.d, z4.d, z0.d +; SVE-NEXT: orr z0.d, z1.d, z0.d ; SVE-NEXT: str d0, [x0] -; SVE-NEXT: add sp, sp, #16 ; SVE-NEXT: ret ; ; SVE2-LABEL: test_copysign_v4f16_v4f64: ; SVE2: // %bb.0: -; SVE2-NEXT: sub sp, sp, #16 -; SVE2-NEXT: .cfi_def_cfa_offset 16 -; SVE2-NEXT: ldp q2, q1, [x1] -; SVE2-NEXT: mov z0.h, #32767 // =0x7fff -; SVE2-NEXT: ldr d5, [x0] -; SVE2-NEXT: mov z3.d, z1.d[1] -; SVE2-NEXT: mov z4.d, z2.d[1] +; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: mov z2.d, z1.d[1] +; SVE2-NEXT: mov z3.d, z0.d[1] ; SVE2-NEXT: fcvt h1, d1 +; SVE2-NEXT: fcvt h0, d0 ; SVE2-NEXT: fcvt h2, d2 ; SVE2-NEXT: fcvt h3, d3 -; SVE2-NEXT: fcvt h4, d4 -; SVE2-NEXT: str h1, [sp, #12] -; SVE2-NEXT: str h2, [sp, #8] -; SVE2-NEXT: str h3, [sp, #14] -; SVE2-NEXT: str h4, [sp, #10] -; SVE2-NEXT: ldr d1, [sp, #8] -; SVE2-NEXT: bsl z5.d, z5.d, z1.d, z0.d -; SVE2-NEXT: str d5, [x0] -; SVE2-NEXT: add sp, sp, #16 +; SVE2-NEXT: zip1 z1.h, z1.h, z2.h +; SVE2-NEXT: zip1 z0.h, z0.h, z3.h +; SVE2-NEXT: mov z2.h, #32767 // =0x7fff +; SVE2-NEXT: zip1 z0.s, z0.s, z1.s +; SVE2-NEXT: ldr d1, [x0] +; SVE2-NEXT: bsl z1.d, z1.d, z0.d, z2.d +; SVE2-NEXT: str d1, [x0] ; SVE2-NEXT: ret ; ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index a206fbc51029..11fee267660c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -443,9 +443,10 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) { ; CHECK-NEXT: mov z1.h, z0.h[1] ; CHECK-NEXT: fcvtzu x8, h0 ; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: stp x8, x9, [sp, #-16]! 
-; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64: @@ -471,19 +472,20 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v4f16_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: fcvtzu x8, h0 -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: fcvtzu x10, h2 -; CHECK-NEXT: fcvtzu x11, h0 -; CHECK-NEXT: stp x8, x9, [sp, #-32]! -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: mov z1.h, z0.h[3] +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: fcvtzu x10, h0 +; CHECK-NEXT: fcvtzu x8, h1 +; CHECK-NEXT: fcvtzu x9, h2 +; CHECK-NEXT: fcvtzu x11, h3 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: fmov d1, x11 +; CHECK-NEXT: zip1 z1.d, z2.d, z1.d ; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64: @@ -521,31 +523,35 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fcvtzu x12, h0 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z3.h, z0.h[2] +; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: fcvtzu x10, h0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: fcvtzu x8, h1 -; CHECK-NEXT: mov z3.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: fcvtzu x9, h2 -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: fcvtzu x10, h3 -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzu x11, h1 -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: stp x8, x9, [sp, #-64]! 
-; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: fcvtzu x8, h2 ; CHECK-NEXT: fcvtzu x9, h3 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: fcvtzu x10, h0 -; CHECK-NEXT: ldp q2, q3, [sp] -; CHECK-NEXT: stp x12, x8, [sp, #32] -; CHECK-NEXT: stp x10, x9, [sp, #48] -; CHECK-NEXT: ldp q1, q0, [sp, #32] -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: fcvtzu x11, h4 +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z6.h, z1.h[2] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fcvtzu x14, h1 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: fmov d3, x11 +; CHECK-NEXT: fcvtzu x12, h5 +; CHECK-NEXT: fcvtzu x13, h6 +; CHECK-NEXT: fcvtzu x15, h2 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: fmov d1, x12 +; CHECK-NEXT: fmov d4, x13 +; CHECK-NEXT: zip1 z2.d, z2.d, z3.d +; CHECK-NEXT: fmov d3, x14 +; CHECK-NEXT: zip1 z1.d, z4.d, z1.d +; CHECK-NEXT: fmov d4, x15 +; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: zip1 z3.d, z3.d, z4.d +; CHECK-NEXT: stp q3, q1, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64: @@ -598,57 +604,67 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: mov z5.d, z1.d +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z4.h, z1.h[1] +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: fcvtzu x9, h1 +; CHECK-NEXT: fcvtzu x8, h0 +; CHECK-NEXT: mov z7.h, z0.h[1] ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: mov z4.h, z2.h[1] -; CHECK-NEXT: fcvtzu x8, h2 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z2.h[2] -; CHECK-NEXT: fcvtzu x12, h3 -; CHECK-NEXT: fcvtzu x9, h4 -; CHECK-NEXT: mov z4.h, z3.h[1] -; CHECK-NEXT: fcvtzu x10, h5 -; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: fcvtzu x11, h2 -; CHECK-NEXT: mov z2.h, z3.h[2] -; CHECK-NEXT: stp x8, x9, [sp, #-128]! 
-; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: fcvtzu x8, h4 -; CHECK-NEXT: fcvtzu x9, h5 -; CHECK-NEXT: stp x11, x10, [sp, #16] +; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8 ; CHECK-NEXT: fcvtzu x10, h2 -; CHECK-NEXT: mov z3.h, z1.h[1] -; CHECK-NEXT: mov z4.h, z1.h[3] -; CHECK-NEXT: fcvtzu x11, h1 +; CHECK-NEXT: fcvtzu x11, h4 +; CHECK-NEXT: fcvtzu x12, h6 ; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: stp x12, x8, [sp, #64] -; CHECK-NEXT: fcvtzu x12, h3 -; CHECK-NEXT: fcvtzu x8, h4 -; CHECK-NEXT: stp x10, x9, [sp, #80] -; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzu x10, h0 ; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: stp x11, x12, [sp, #32] -; CHECK-NEXT: fcvtzu x11, h2 -; CHECK-NEXT: fcvtzu x12, h3 -; CHECK-NEXT: stp x9, x8, [sp, #48] -; CHECK-NEXT: fcvtzu x8, h0 -; CHECK-NEXT: ldp q0, q1, [sp] -; CHECK-NEXT: ldp q3, q4, [sp, #64] -; CHECK-NEXT: stp x10, x11, [sp, #96] -; CHECK-NEXT: ldp q6, q7, [sp, #32] -; CHECK-NEXT: stp x8, x12, [sp, #112] -; CHECK-NEXT: ldp q5, q2, [sp, #96] -; CHECK-NEXT: stp q0, q1, [x1, #32] -; CHECK-NEXT: stp q6, q7, [x1] -; CHECK-NEXT: stp q3, q4, [x1, #96] -; CHECK-NEXT: stp q5, q2, [x1, #64] -; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: fmov d16, x9 +; CHECK-NEXT: mov z2.h, z3.h[3] +; CHECK-NEXT: mov z4.h, z5.h[3] +; CHECK-NEXT: fcvtzu x14, h3 +; CHECK-NEXT: fcvtzu x13, h1 +; CHECK-NEXT: fcvtzu x15, h5 +; CHECK-NEXT: mov z1.h, z3.h[1] +; CHECK-NEXT: mov z6.h, z5.h[1] +; CHECK-NEXT: mov z5.h, z5.h[2] +; CHECK-NEXT: mov z3.h, z3.h[2] +; CHECK-NEXT: fcvtzu x9, h2 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: fcvtzu x10, h4 +; CHECK-NEXT: fmov d4, x11 +; CHECK-NEXT: fcvtzu x11, h7 +; CHECK-NEXT: fmov d7, x12 +; CHECK-NEXT: fcvtzu x12, h0 +; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: fcvtzu x13, h1 +; CHECK-NEXT: fmov d1, x14 +; CHECK-NEXT: fcvtzu x14, h6 +; CHECK-NEXT: fmov d6, x15 +; CHECK-NEXT: fcvtzu x15, h5 +; CHECK-NEXT: fmov d5, x9 +; CHECK-NEXT: fcvtzu x9, h3 +; CHECK-NEXT: zip1 z4.d, z16.d, z4.d +; CHECK-NEXT: fmov d16, x8 +; CHECK-NEXT: zip1 z0.d, z0.d, z7.d +; CHECK-NEXT: fmov d3, x12 +; CHECK-NEXT: fmov d7, x10 +; CHECK-NEXT: stp q4, q0, [x1, #64] +; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: zip1 z2.d, z3.d, z2.d +; CHECK-NEXT: fmov d3, x11 +; CHECK-NEXT: zip1 z0.d, z6.d, z0.d +; CHECK-NEXT: zip1 z4.d, z4.d, z5.d +; CHECK-NEXT: zip1 z3.d, z16.d, z3.d +; CHECK-NEXT: fmov d16, x15 +; CHECK-NEXT: stp q3, q2, [x1] +; CHECK-NEXT: fmov d2, x13 +; CHECK-NEXT: zip1 z7.d, z16.d, z7.d +; CHECK-NEXT: zip1 z1.d, z1.d, z2.d +; CHECK-NEXT: stp q0, q7, [x1, #96] +; CHECK-NEXT: stp q1, q4, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64: @@ -1216,26 +1232,18 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) { define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; CHECK-LABEL: fcvtzu_v4f64_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: mov z2.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z1.s[1] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh 
w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: mov z3.s, z0.s[1] +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16: @@ -1270,40 +1278,29 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzu_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.s, z3.s[1] -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z3.s[1] +; CHECK-NEXT: mov z7.s, z2.s[1] +; CHECK-NEXT: zip1 z0.h, z0.h, z4.h +; CHECK-NEXT: zip1 z1.h, z1.h, z5.h +; CHECK-NEXT: zip1 z3.h, z3.h, z6.h +; CHECK-NEXT: zip1 z2.h, z2.h, z7.h +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: zip1 z1.s, z2.s, z3.s +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16: @@ -1360,73 +1357,50 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q5, q6, [x0, #96] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: ldp q0, q4, [x0, #32] +; CHECK-NEXT: ldp q2, q7, [x0, #64] +; CHECK-NEXT: ldp q1, q3, [x0] +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d +; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.d ; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d -; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d -; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s ; 
CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z16.s, z1.s[1] -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z3.s[1] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: movprfx z3, z7 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.d -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: uzp1 z1.s, z4.s, z4.s -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s -; CHECK-NEXT: mov z3.s, z5.s[1] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #28] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #24] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: ldp q1, q0, [sp] -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: mov z17.s, z6.s[1] +; CHECK-NEXT: mov z16.s, z4.s[1] +; CHECK-NEXT: mov z18.s, z5.s[1] +; CHECK-NEXT: mov z21.s, z0.s[1] +; CHECK-NEXT: mov z19.s, z7.s[1] +; CHECK-NEXT: mov z20.s, z2.s[1] +; CHECK-NEXT: mov z22.s, z3.s[1] +; CHECK-NEXT: mov z23.s, z1.s[1] +; CHECK-NEXT: zip1 z6.h, z6.h, z17.h +; CHECK-NEXT: zip1 z4.h, z4.h, z16.h +; CHECK-NEXT: zip1 z5.h, z5.h, z18.h +; CHECK-NEXT: zip1 z0.h, z0.h, z21.h +; CHECK-NEXT: zip1 z7.h, z7.h, z19.h +; CHECK-NEXT: zip1 z2.h, z2.h, z20.h +; CHECK-NEXT: zip1 z3.h, z3.h, z22.h +; CHECK-NEXT: zip1 z1.h, z1.h, z23.h +; CHECK-NEXT: zip1 z5.s, z5.s, z6.s +; CHECK-NEXT: zip1 z0.s, z0.s, z4.s +; CHECK-NEXT: zip1 z2.s, z2.s, z7.s +; CHECK-NEXT: zip1 z1.s, z1.s, z3.s +; CHECK-NEXT: zip1 z2.d, z2.d, z5.d +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: stp q0, q2, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16: @@ -2187,9 +2161,10 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) { ; CHECK-NEXT: mov z1.h, z0.h[1] ; CHECK-NEXT: fcvtzs x8, h0 ; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: stp x8, x9, [sp, #-16]! -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64: @@ -2215,19 +2190,20 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v4f16_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: fcvtzs x8, h0 -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: fcvtzs x10, h2 -; CHECK-NEXT: fcvtzs x11, h0 -; CHECK-NEXT: stp x8, x9, [sp, #-32]! 
-; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: mov z1.h, z0.h[3] +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: fcvtzs x10, h0 +; CHECK-NEXT: fcvtzs x8, h1 +; CHECK-NEXT: fcvtzs x9, h2 +; CHECK-NEXT: fcvtzs x11, h3 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: fmov d1, x11 +; CHECK-NEXT: zip1 z1.d, z2.d, z1.d ; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64: @@ -2265,31 +2241,35 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: fcvtzs x12, h0 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z3.h, z0.h[2] +; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: fcvtzs x10, h0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: fcvtzs x8, h1 -; CHECK-NEXT: mov z3.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: fcvtzs x9, h2 -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: fcvtzs x10, h3 -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzs x11, h1 -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: stp x8, x9, [sp, #-64]! -; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: fcvtzs x8, h2 ; CHECK-NEXT: fcvtzs x9, h3 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: fcvtzs x10, h0 -; CHECK-NEXT: ldp q2, q3, [sp] -; CHECK-NEXT: stp x12, x8, [sp, #32] -; CHECK-NEXT: stp x10, x9, [sp, #48] -; CHECK-NEXT: ldp q1, q0, [sp, #32] -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: fcvtzs x11, h4 +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z6.h, z1.h[2] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fcvtzs x14, h1 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: fmov d3, x11 +; CHECK-NEXT: fcvtzs x12, h5 +; CHECK-NEXT: fcvtzs x13, h6 +; CHECK-NEXT: fcvtzs x15, h2 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: fmov d1, x12 +; CHECK-NEXT: fmov d4, x13 +; CHECK-NEXT: zip1 z2.d, z2.d, z3.d +; CHECK-NEXT: fmov d3, x14 +; CHECK-NEXT: zip1 z1.d, z4.d, z1.d +; CHECK-NEXT: fmov d4, x15 +; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: zip1 z3.d, z3.d, z4.d +; CHECK-NEXT: stp q3, q1, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64: @@ -2342,57 +2322,67 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f16_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: mov z5.d, z1.d +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z4.h, z1.h[1] +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: fcvtzs x9, h1 +; CHECK-NEXT: fcvtzs x8, h0 +; CHECK-NEXT: mov z7.h, z0.h[1] ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: mov z4.h, z2.h[1] -; CHECK-NEXT: fcvtzs x8, h2 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z2.h[2] -; CHECK-NEXT: fcvtzs x12, h3 -; CHECK-NEXT: fcvtzs x9, h4 -; CHECK-NEXT: mov z4.h, z3.h[1] -; CHECK-NEXT: fcvtzs x10, h5 -; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: fcvtzs x11, h2 -; CHECK-NEXT: mov z2.h, z3.h[2] -; CHECK-NEXT: stp x8, x9, [sp, #-128]! 
-; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: fcvtzs x8, h4 -; CHECK-NEXT: fcvtzs x9, h5 -; CHECK-NEXT: stp x11, x10, [sp, #16] +; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8 ; CHECK-NEXT: fcvtzs x10, h2 -; CHECK-NEXT: mov z3.h, z1.h[1] -; CHECK-NEXT: mov z4.h, z1.h[3] -; CHECK-NEXT: fcvtzs x11, h1 +; CHECK-NEXT: fcvtzs x11, h4 +; CHECK-NEXT: fcvtzs x12, h6 ; CHECK-NEXT: mov z1.h, z1.h[2] -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: stp x12, x8, [sp, #64] -; CHECK-NEXT: fcvtzs x12, h3 -; CHECK-NEXT: fcvtzs x8, h4 -; CHECK-NEXT: stp x10, x9, [sp, #80] -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzs x10, h0 ; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: stp x11, x12, [sp, #32] -; CHECK-NEXT: fcvtzs x11, h2 -; CHECK-NEXT: fcvtzs x12, h3 -; CHECK-NEXT: stp x9, x8, [sp, #48] -; CHECK-NEXT: fcvtzs x8, h0 -; CHECK-NEXT: ldp q0, q1, [sp] -; CHECK-NEXT: ldp q3, q4, [sp, #64] -; CHECK-NEXT: stp x10, x11, [sp, #96] -; CHECK-NEXT: ldp q6, q7, [sp, #32] -; CHECK-NEXT: stp x8, x12, [sp, #112] -; CHECK-NEXT: ldp q5, q2, [sp, #96] -; CHECK-NEXT: stp q0, q1, [x1, #32] -; CHECK-NEXT: stp q6, q7, [x1] -; CHECK-NEXT: stp q3, q4, [x1, #96] -; CHECK-NEXT: stp q5, q2, [x1, #64] -; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: fmov d16, x9 +; CHECK-NEXT: mov z2.h, z3.h[3] +; CHECK-NEXT: mov z4.h, z5.h[3] +; CHECK-NEXT: fcvtzs x14, h3 +; CHECK-NEXT: fcvtzs x13, h1 +; CHECK-NEXT: fcvtzs x15, h5 +; CHECK-NEXT: mov z1.h, z3.h[1] +; CHECK-NEXT: mov z6.h, z5.h[1] +; CHECK-NEXT: mov z5.h, z5.h[2] +; CHECK-NEXT: mov z3.h, z3.h[2] +; CHECK-NEXT: fcvtzs x9, h2 +; CHECK-NEXT: fmov d2, x10 +; CHECK-NEXT: fcvtzs x10, h4 +; CHECK-NEXT: fmov d4, x11 +; CHECK-NEXT: fcvtzs x11, h7 +; CHECK-NEXT: fmov d7, x12 +; CHECK-NEXT: fcvtzs x12, h0 +; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: fcvtzs x13, h1 +; CHECK-NEXT: fmov d1, x14 +; CHECK-NEXT: fcvtzs x14, h6 +; CHECK-NEXT: fmov d6, x15 +; CHECK-NEXT: fcvtzs x15, h5 +; CHECK-NEXT: fmov d5, x9 +; CHECK-NEXT: fcvtzs x9, h3 +; CHECK-NEXT: zip1 z4.d, z16.d, z4.d +; CHECK-NEXT: fmov d16, x8 +; CHECK-NEXT: zip1 z0.d, z0.d, z7.d +; CHECK-NEXT: fmov d3, x12 +; CHECK-NEXT: fmov d7, x10 +; CHECK-NEXT: stp q4, q0, [x1, #64] +; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: zip1 z2.d, z3.d, z2.d +; CHECK-NEXT: fmov d3, x11 +; CHECK-NEXT: zip1 z0.d, z6.d, z0.d +; CHECK-NEXT: zip1 z4.d, z4.d, z5.d +; CHECK-NEXT: zip1 z3.d, z16.d, z3.d +; CHECK-NEXT: fmov d16, x15 +; CHECK-NEXT: stp q3, q2, [x1] +; CHECK-NEXT: fmov d2, x13 +; CHECK-NEXT: zip1 z7.d, z16.d, z7.d +; CHECK-NEXT: zip1 z1.d, z1.d, z2.d +; CHECK-NEXT: stp q0, q7, [x1, #96] +; CHECK-NEXT: stp q1, q4, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64: @@ -2962,26 +2952,18 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) { define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; CHECK-LABEL: fcvtzs_v4f64_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: mov z2.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z1.s[1] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh 
w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: mov z3.s, z0.s[1] +; CHECK-NEXT: zip1 z1.h, z1.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16: @@ -3016,40 +2998,29 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzs_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.s, z3.s[1] -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z3.s[1] +; CHECK-NEXT: mov z7.s, z2.s[1] +; CHECK-NEXT: zip1 z0.h, z0.h, z4.h +; CHECK-NEXT: zip1 z1.h, z1.h, z5.h +; CHECK-NEXT: zip1 z3.h, z3.h, z6.h +; CHECK-NEXT: zip1 z2.h, z2.h, z7.h +; CHECK-NEXT: zip1 z0.s, z1.s, z0.s +; CHECK-NEXT: zip1 z1.s, z2.s, z3.s +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16: @@ -3106,73 +3077,50 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q5, q6, [x0, #96] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: ldp q0, q4, [x0, #32] +; CHECK-NEXT: ldp q2, q7, [x0, #64] +; CHECK-NEXT: ldp q1, q3, [x0] +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d +; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.d ; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d -; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d -; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d +; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s ; 
CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z16.s, z1.s[1] -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z0.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z3.s[1] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: movprfx z3, z7 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.d -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: uzp1 z1.s, z4.s, z4.s -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s -; CHECK-NEXT: mov z3.s, z5.s[1] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #28] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #24] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z2.s[1] -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: ldp q1, q0, [sp] -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: mov z17.s, z6.s[1] +; CHECK-NEXT: mov z16.s, z4.s[1] +; CHECK-NEXT: mov z18.s, z5.s[1] +; CHECK-NEXT: mov z21.s, z0.s[1] +; CHECK-NEXT: mov z19.s, z7.s[1] +; CHECK-NEXT: mov z20.s, z2.s[1] +; CHECK-NEXT: mov z22.s, z3.s[1] +; CHECK-NEXT: mov z23.s, z1.s[1] +; CHECK-NEXT: zip1 z6.h, z6.h, z17.h +; CHECK-NEXT: zip1 z4.h, z4.h, z16.h +; CHECK-NEXT: zip1 z5.h, z5.h, z18.h +; CHECK-NEXT: zip1 z0.h, z0.h, z21.h +; CHECK-NEXT: zip1 z7.h, z7.h, z19.h +; CHECK-NEXT: zip1 z2.h, z2.h, z20.h +; CHECK-NEXT: zip1 z3.h, z3.h, z22.h +; CHECK-NEXT: zip1 z1.h, z1.h, z23.h +; CHECK-NEXT: zip1 z5.s, z5.s, z6.s +; CHECK-NEXT: zip1 z0.s, z0.s, z4.s +; CHECK-NEXT: zip1 z2.s, z2.s, z7.s +; CHECK-NEXT: zip1 z1.s, z1.s, z3.s +; CHECK-NEXT: zip1 z2.d, z2.d, z5.d +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: stp q0, q2, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index 035c76b56929..ad5f91a5f39a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -8,25 +8,18 @@ target triple = "aarch64-unknown-linux-gnu" define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask) { ; CHECK-LABEL: select_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: mov z3.s, z2.s[1] -; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d2, [sp, #8] +; 
CHECK-NEXT: zip1 z2.h, z2.h, z3.h ; CHECK-NEXT: lsl z2.h, z2.h, #15 ; CHECK-NEXT: asr z2.h, z2.h, #15 ; CHECK-NEXT: and z2.h, z2.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v2f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index d77473ed8f08..275d13ebfd94 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -506,14 +506,10 @@ define <4 x i64> @insertelement_v4i64(ptr %a) { define <2 x half> @insertelement_v2f16(<2 x half> %op1) { ; CHECK-LABEL: insertelement_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: fmov h1, #5.00000000 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: insertelement_v2f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 0c712a15d4de..e595686cb497 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -1140,18 +1140,14 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { ; CHECK-LABEL: ucvtf_v2i64_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d[1] ; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: ucvtf h0, x8 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: ucvtf h1, x8 -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ucvtf h1, x9 +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16: @@ -2598,18 +2594,14 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) { define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) { ; CHECK-LABEL: scvtf_v2i64_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d[1] ; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: scvtf h0, x8 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: scvtf h1, x8 -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: scvtf h1, x9 +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index 270f05a806b8..613543310f2c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -10,25 +10,20 @@ declare void @def(ptr) define void @alloc_v4i8(ptr %st_ptr) nounwind { ; CHECK-LABEL: alloc_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: add x0, sp, #28 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: add x20, sp, #28 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x20, sp, #12 ; CHECK-NEXT: bl def ; CHECK-NEXT: ptrue p0.b, vl2 ; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x20] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: mov z2.b, z0.b[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w9, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: zip1 z0.s, z0.s, z2.s ; CHECK-NEXT: st1b { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: alloc_v4i8: @@ -62,32 +57,28 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind { define void @alloc_v6i8(ptr %st_ptr) nounwind { ; CHECK-LABEL: alloc_v6i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x0, sp, #8 ; CHECK-NEXT: bl def -; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: add x8, sp, #4 ; CHECK-NEXT: ptrue p1.s, vl2 ; CHECK-NEXT: mov z1.b, z0.b[3] -; CHECK-NEXT: mov z2.b, z0.b[5] -; CHECK-NEXT: mov z0.b, z0.b[1] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: add x8, sp, #20 -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: st1b { z0.h }, p0, [x8] -; CHECK-NEXT: ld1h { z0.s }, p1/z, [x8] -; CHECK-NEXT: strb w9, [x19, #2] +; CHECK-NEXT: mov z2.b, z0.b[1] +; CHECK-NEXT: mov z0.b, z0.b[5] +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z1.s, z1.s, z0.s +; CHECK-NEXT: st1b { z1.h }, p0, [x8] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x19, #2] +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strh w8, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: alloc_v6i8: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 5f4b9dd1592c..9055b2efba32 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -1466,23 +1466,18 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) { define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) { ; CHECK-LABEL: masked_load_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: 
sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: zip1 z0.h, z0.h, z2.h +; CHECK-NEXT: zip1 z1.h, z1.h, z1.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_load_v2f16: @@ -2318,33 +2313,21 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) { ; CHECK-LABEL: masked_load_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov z1.b, z0.b[3] ; CHECK-NEXT: mov z2.b, z0.b[2] +; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: mov z3.b, z0.b[1] ; CHECK-NEXT: mov z4.b, z0.b[7] -; CHECK-NEXT: strh w8, [sp, #-16]! -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.b, z0.b[6] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[5] -; CHECK-NEXT: mov z0.b, z0.b[4] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #4] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: mov x8, #4 // =0x4 -; CHECK-NEXT: ldp d0, d1, [sp] +; CHECK-NEXT: mov z5.b, z0.b[6] +; CHECK-NEXT: mov z6.b, z0.b[5] +; CHECK-NEXT: mov z7.b, z0.b[4] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z2.h, z5.h, z4.h +; CHECK-NEXT: zip1 z3.h, z7.h, z6.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: zip1 z1.s, z3.s, z2.s ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 @@ -2357,7 +2340,6 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_load_v8f32: @@ -2684,23 +2666,21 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-LABEL: masked_load_zext_v3i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: strh w3, [sp, #12] +; CHECK-NEXT: fmov s0, w2 +; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: adrp x8, .LCPI13_0 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: strh w2, [sp, #10] -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: strh w1, [sp, #8] -; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: zip1 z0.h, z1.h, z0.h +; CHECK-NEXT: fmov s1, w3 +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, 
#15 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_load_zext_v3i32: @@ -2759,23 +2739,21 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-LABEL: masked_load_sext_v3i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: strh w3, [sp, #12] +; CHECK-NEXT: fmov s0, w2 +; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: adrp x8, .LCPI14_0 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: strh w2, [sp, #10] -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: strh w1, [sp, #8] -; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: zip1 z0.h, z1.h, z0.h +; CHECK-NEXT: fmov s1, w3 +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_load_sext_v3i32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 0c3411e5f551..265480b57197 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -589,23 +589,18 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: zip1 z0.h, z0.h, z2.h +; CHECK-NEXT: zip1 z1.h, z1.h, z1.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_store_v2f16: @@ -1014,48 +1009,33 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-LABEL: masked_store_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.b, z0.b[7] ; CHECK-NEXT: mov z2.b, z0.b[6] +; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: mov z3.b, z0.b[5] ; CHECK-NEXT: mov z4.b, z0.b[4] +; CHECK-NEXT: mov z5.b, z0.b[3] +; CHECK-NEXT: mov z6.b, z0.b[2] +; CHECK-NEXT: mov z7.b, z0.b[1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[3] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; 
CHECK-NEXT: mov z3.b, z0.b[2] -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z4.b, z0.b[1] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: mov x8, #4 // =0x4 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: zip1 z1.h, z2.h, z1.h +; CHECK-NEXT: zip1 z2.h, z4.h, z3.h +; CHECK-NEXT: zip1 z3.h, z6.h, z5.h +; CHECK-NEXT: zip1 z0.h, z0.h, z7.h +; CHECK-NEXT: zip1 z1.s, z2.s, z1.s +; CHECK-NEXT: zip1 z0.s, z0.s, z3.s ; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 ; CHECK-NEXT: mov z1.s, #0 // =0x0 -; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr d0, [sp] -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z1.s }, p0, [x0] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_store_v8f32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index b91f813c5141..8b296d9fbc21 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -9,65 +9,44 @@ target triple = "aarch64-unknown-linux-gnu" define void @zip1_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: zip1_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: mov z2.b, z0.b[15] -; CHECK-NEXT: mov z3.b, z0.b[14] -; CHECK-NEXT: mov z4.b, z0.b[13] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.b, z0.b[11] -; CHECK-NEXT: mov z2.b, z0.b[12] -; CHECK-NEXT: strb w8, [sp, #14] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z0.b[10] -; CHECK-NEXT: strb w9, [sp, #12] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[9] -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[8] -; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: mov z4.b, z0.b[14] +; CHECK-NEXT: mov z6.b, z0.b[13] +; CHECK-NEXT: mov z3.b, z1.b[15] +; CHECK-NEXT: mov z5.b, z1.b[14] +; CHECK-NEXT: mov z7.b, z1.b[13] +; CHECK-NEXT: mov z16.b, z0.b[12] +; CHECK-NEXT: mov z17.b, z1.b[12] +; CHECK-NEXT: mov z18.b, z0.b[11] +; CHECK-NEXT: mov z19.b, z1.b[11] +; CHECK-NEXT: mov z20.b, z0.b[10] +; CHECK-NEXT: mov z21.b, z1.b[10] +; CHECK-NEXT: mov z22.b, z0.b[9] +; CHECK-NEXT: mov z23.b, z1.b[9] +; CHECK-NEXT: mov z24.b, z0.b[8] +; CHECK-NEXT: mov z25.b, z1.b[8] +; CHECK-NEXT: zip1 z2.b, z2.b, z3.b +; CHECK-NEXT: zip1 z3.b, z4.b, z5.b +; CHECK-NEXT: zip1 z4.b, z6.b, z7.b +; CHECK-NEXT: zip1 z5.b, z16.b, z17.b +; CHECK-NEXT: zip1 z6.b, z18.b, z19.b +; CHECK-NEXT: zip1 z7.b, z20.b, z21.b +; CHECK-NEXT: zip1 z16.b, z22.b, z23.b ; CHECK-NEXT: zip1 z0.b, z0.b, z1.b -; CHECK-NEXT: strb w8, [sp, #6] -; CHECK-NEXT: fmov w8, s4 -; 
CHECK-NEXT: mov z4.b, z1.b[15] -; CHECK-NEXT: strb w8, [sp, #4] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[14] -; CHECK-NEXT: strb w8, [sp, #2] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[13] -; CHECK-NEXT: strb w8, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[12] -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[11] -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[10] -; CHECK-NEXT: strb w8, [sp, #11] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[9] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: strb w8, [sp, #9] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[8] -; CHECK-NEXT: strb w9, [sp, #5] -; CHECK-NEXT: strb w8, [sp, #7] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strb w8, [sp, #3] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #1] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z17.b, z24.b, z25.b +; CHECK-NEXT: zip1 z2.h, z3.h, z2.h +; CHECK-NEXT: zip1 z3.h, z5.h, z4.h +; CHECK-NEXT: zip1 z4.h, z7.h, z6.h ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z5.h, z17.h, z16.h +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip1_v32i8: @@ -159,123 +138,97 @@ define void @zip1_v32i8(ptr %a, ptr %b) { define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-LABEL: zip_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: ldp q2, q5, [x0, #32] -; CHECK-NEXT: mov z16.h, z3.h[7] -; CHECK-NEXT: mov z18.h, z3.h[6] -; CHECK-NEXT: mov z17.h, z4.h[7] -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: mov z19.h, z4.h[6] -; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: .cfi_offset b14, -56 +; CHECK-NEXT: .cfi_offset b15, -64 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: mov z5.h, z1.h[7] +; CHECK-NEXT: mov z7.h, z1.h[6] +; CHECK-NEXT: mov z17.h, z1.h[5] +; CHECK-NEXT: mov z4.h, z3.h[7] +; CHECK-NEXT: mov z6.h, z3.h[6] ; CHECK-NEXT: mov z16.h, z3.h[5] -; CHECK-NEXT: fmov w9, s17 -; CHECK-NEXT: mov z17.h, z4.h[5] -; CHECK-NEXT: mov z20.h, z7.h[6] -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z20.h, z2.h[7] +; CHECK-NEXT: mov z21.h, z0.h[7] ; CHECK-NEXT: mov z18.h, z3.h[4] -; CHECK-NEXT: strh w9, [sp, #28] -; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: mov z19.h, z5.h[7] -; CHECK-NEXT: zip1 z3.h, z4.h, z3.h -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z4.h[4] -; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: zip1 z4.h, z5.h, z7.h -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z1.h[7] -; CHECK-NEXT: add z3.h, z3.h, z4.h -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z0.h[7] -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: fmov w8, s16 -; 
CHECK-NEXT: mov z16.h, z1.h[6] -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z0.h[6] -; CHECK-NEXT: strh w8, [sp, #62] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z1.h[5] -; CHECK-NEXT: strh w8, [sp, #60] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z0.h[5] -; CHECK-NEXT: strh w8, [sp, #58] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z1.h[4] -; CHECK-NEXT: strh w8, [sp, #56] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z0.h[4] -; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: zip1 z1.h, z2.h, z6.h -; CHECK-NEXT: strh w8, [sp, #54] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: ldr q16, [sp, #16] -; CHECK-NEXT: add z0.h, z0.h, z1.h -; CHECK-NEXT: strh w8, [sp, #52] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: strh w8, [sp, #50] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z7.h[7] -; CHECK-NEXT: strh w8, [sp, #48] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z5.h[6] -; CHECK-NEXT: ldr q17, [sp, #48] -; CHECK-NEXT: strh w8, [sp, #46] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z7.h[5] -; CHECK-NEXT: strh w8, [sp, #44] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z5.h[5] -; CHECK-NEXT: strh w8, [sp, #42] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z7.h[4] -; CHECK-NEXT: strh w8, [sp, #40] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z5.h[4] -; CHECK-NEXT: strh w8, [sp, #38] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z6.h[7] -; CHECK-NEXT: strh w8, [sp, #36] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z2.h[7] -; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z6.h[6] -; CHECK-NEXT: strh w8, [sp, #32] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z2.h[6] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z6.h[5] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z2.h[5] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z6.h[4] -; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z2.h[4] -; CHECK-NEXT: strh w9, [sp, #4] -; CHECK-NEXT: ldr q2, [sp, #32] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: fmov w9, s18 -; CHECK-NEXT: add z2.h, z16.h, z2.h -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: ldr q4, [sp] -; CHECK-NEXT: stp q3, q2, [x0, #32] -; CHECK-NEXT: add z1.h, z17.h, z4.h -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: mov z19.h, z1.h[4] +; CHECK-NEXT: mov z22.h, z2.h[6] +; CHECK-NEXT: mov z23.h, z0.h[6] +; CHECK-NEXT: zip1 z24.h, z5.h, z4.h +; CHECK-NEXT: zip1 z25.h, z7.h, z6.h +; CHECK-NEXT: zip1 z17.h, z17.h, z16.h +; CHECK-NEXT: ldp q4, q6, [x0, #32] +; CHECK-NEXT: zip1 z16.h, z21.h, z20.h +; CHECK-NEXT: ldp q5, q7, [x1, #32] +; CHECK-NEXT: zip1 z18.h, z19.h, z18.h +; CHECK-NEXT: zip1 z19.s, z25.s, z24.s +; CHECK-NEXT: zip1 z22.h, z23.h, z22.h +; CHECK-NEXT: mov z23.h, z2.h[5] +; CHECK-NEXT: mov z21.h, z6.h[7] +; CHECK-NEXT: mov z24.h, z0.h[5] +; CHECK-NEXT: mov z25.h, z2.h[4] +; CHECK-NEXT: mov z20.h, z7.h[7] +; CHECK-NEXT: mov z26.h, z0.h[4] +; CHECK-NEXT: mov z27.h, z6.h[6] +; CHECK-NEXT: mov z28.h, z7.h[5] +; CHECK-NEXT: mov z29.h, z6.h[5] +; CHECK-NEXT: mov z30.h, z7.h[4] +; CHECK-NEXT: mov z31.h, z6.h[4] +; CHECK-NEXT: mov z8.h, z5.h[7] +; CHECK-NEXT: mov z9.h, z4.h[7] +; CHECK-NEXT: zip1 z20.h, z21.h, z20.h +; CHECK-NEXT: mov 
z21.h, z7.h[6] +; CHECK-NEXT: mov z10.h, z5.h[6] +; CHECK-NEXT: mov z11.h, z4.h[6] +; CHECK-NEXT: mov z12.h, z5.h[5] +; CHECK-NEXT: mov z13.h, z4.h[5] +; CHECK-NEXT: mov z14.h, z5.h[4] +; CHECK-NEXT: mov z15.h, z4.h[4] +; CHECK-NEXT: zip1 z23.h, z24.h, z23.h +; CHECK-NEXT: zip1 z21.h, z27.h, z21.h +; CHECK-NEXT: zip1 z27.h, z29.h, z28.h +; CHECK-NEXT: zip1 z28.h, z31.h, z30.h +; CHECK-NEXT: zip1 z24.h, z26.h, z25.h +; CHECK-NEXT: zip1 z25.h, z9.h, z8.h +; CHECK-NEXT: zip1 z26.h, z11.h, z10.h +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z29.h, z13.h, z12.h +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z30.h, z15.h, z14.h +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z17.s, z18.s, z17.s +; CHECK-NEXT: zip1 z18.s, z21.s, z20.s +; CHECK-NEXT: zip1 z20.s, z28.s, z27.s +; CHECK-NEXT: zip1 z16.s, z22.s, z16.s +; CHECK-NEXT: zip1 z21.s, z24.s, z23.s +; CHECK-NEXT: zip1 z1.h, z1.h, z3.h +; CHECK-NEXT: zip1 z3.s, z26.s, z25.s +; CHECK-NEXT: zip1 z22.s, z30.s, z29.s +; CHECK-NEXT: zip1 z6.h, z6.h, z7.h +; CHECK-NEXT: zip1 z7.d, z17.d, z19.d +; CHECK-NEXT: zip1 z17.d, z20.d, z18.d +; CHECK-NEXT: zip1 z0.h, z0.h, z2.h +; CHECK-NEXT: zip1 z2.h, z4.h, z5.h +; CHECK-NEXT: zip1 z4.d, z21.d, z16.d +; CHECK-NEXT: zip1 z3.d, z22.d, z3.d +; CHECK-NEXT: add z1.h, z1.h, z6.h +; CHECK-NEXT: add z5.h, z7.h, z17.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: add z2.h, z4.h, z3.h +; CHECK-NEXT: stp q1, q5, [x0, #32] +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip_v32i16: @@ -436,41 +389,28 @@ define void @zip_v32i16(ptr %a, ptr %b) { define void @zip1_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: zip1_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: mov z2.h, z0.h[7] -; CHECK-NEXT: mov z3.h, z0.h[6] -; CHECK-NEXT: mov z4.h, z0.h[5] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[4] -; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z0.h[6] +; CHECK-NEXT: mov z6.h, z0.h[5] ; CHECK-NEXT: mov z3.h, z1.h[7] +; CHECK-NEXT: mov z5.h, z1.h[6] +; CHECK-NEXT: mov z7.h, z1.h[5] +; CHECK-NEXT: mov z16.h, z0.h[4] +; CHECK-NEXT: mov z17.h, z1.h[4] ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[6] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z1.h[5] -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.h, z1.h[4] -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.h, z2.h, z3.h +; CHECK-NEXT: zip1 z3.h, z4.h, z5.h +; CHECK-NEXT: zip1 z4.h, z6.h, z7.h +; CHECK-NEXT: zip1 z5.h, z16.h, z17.h ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip1_v16i16: @@ -530,8 +470,6 @@ define void @zip1_v16i16(ptr %a, ptr %b) { define void @zip1_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: 
zip1_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1, #16] @@ -539,18 +477,13 @@ define void @zip1_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: mov z2.s, z0.s[3] ; CHECK-NEXT: mov z4.s, z0.s[2] ; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z1.s[2] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: stp w8, w9, [sp, #8] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w9, [sp] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.s, z2.s, z3.s +; CHECK-NEXT: zip1 z3.s, z4.s, z5.s ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip1_v8i32: @@ -636,25 +569,18 @@ define void @zip_v4f64(ptr %a, ptr %b) { define void @zip_v4i32(ptr %a, ptr %b) { ; CHECK-LABEL: zip_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: mov z2.s, z0.s[3] ; CHECK-NEXT: mov z3.s, z1.s[3] ; CHECK-NEXT: mov z4.s, z0.s[2] +; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: zip1 z0.s, z1.s, z0.s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z1.s[2] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: stp w9, w8, [sp, #8] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w9, w8, [sp] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip_v4i32: @@ -1209,65 +1135,44 @@ define void @trn_v8i32_undef(ptr %a) { define void @zip2_v32i8(ptr %a, ptr %b) #0{ ; CHECK-LABEL: zip2_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: mov z2.b, z0.b[15] -; CHECK-NEXT: mov z3.b, z0.b[14] -; CHECK-NEXT: mov z4.b, z0.b[13] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.b, z0.b[11] -; CHECK-NEXT: mov z2.b, z0.b[12] -; CHECK-NEXT: strb w8, [sp, #14] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z0.b[10] -; CHECK-NEXT: strb w9, [sp, #12] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[9] -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[8] -; CHECK-NEXT: strb w9, [sp, #8] +; CHECK-NEXT: mov z4.b, z0.b[14] +; CHECK-NEXT: mov z6.b, z0.b[13] +; CHECK-NEXT: mov z3.b, z1.b[15] +; CHECK-NEXT: mov z5.b, z1.b[14] +; CHECK-NEXT: mov z7.b, z1.b[13] +; CHECK-NEXT: mov z16.b, z0.b[12] +; CHECK-NEXT: mov z17.b, z1.b[12] +; CHECK-NEXT: mov z18.b, z0.b[11] +; CHECK-NEXT: mov z19.b, z1.b[11] +; CHECK-NEXT: mov z20.b, z0.b[10] +; CHECK-NEXT: mov z21.b, z1.b[10] +; CHECK-NEXT: mov z22.b, z0.b[9] +; CHECK-NEXT: mov z23.b, z1.b[9] +; CHECK-NEXT: mov z24.b, z0.b[8] +; CHECK-NEXT: mov z25.b, z1.b[8] +; CHECK-NEXT: zip1 z2.b, z2.b, z3.b +; CHECK-NEXT: zip1 z3.b, z4.b, z5.b +; CHECK-NEXT: zip1 z4.b, z6.b, z7.b +; CHECK-NEXT: zip1 z5.b, z16.b, z17.b +; CHECK-NEXT: zip1 z6.b, z18.b, z19.b +; CHECK-NEXT: zip1 z7.b, z20.b, z21.b +; CHECK-NEXT: zip1 z16.b, z22.b, z23.b ; CHECK-NEXT: zip1 z0.b, z0.b, z1.b -; 
CHECK-NEXT: strb w8, [sp, #6] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[15] -; CHECK-NEXT: strb w8, [sp, #4] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[14] -; CHECK-NEXT: strb w8, [sp, #2] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[13] -; CHECK-NEXT: strb w8, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[12] -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[11] -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[10] -; CHECK-NEXT: strb w8, [sp, #11] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[9] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: strb w8, [sp, #9] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[8] -; CHECK-NEXT: strb w9, [sp, #5] -; CHECK-NEXT: strb w8, [sp, #7] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strb w8, [sp, #3] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #1] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z17.b, z24.b, z25.b +; CHECK-NEXT: zip1 z2.h, z3.h, z2.h +; CHECK-NEXT: zip1 z3.h, z5.h, z4.h +; CHECK-NEXT: zip1 z4.h, z7.h, z6.h ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z5.h, z17.h, z16.h +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip2_v32i8: @@ -1359,41 +1264,28 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{ define void @zip2_v16i16(ptr %a, ptr %b) #0{ ; CHECK-LABEL: zip2_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: mov z2.h, z0.h[7] -; CHECK-NEXT: mov z3.h, z0.h[6] -; CHECK-NEXT: mov z4.h, z0.h[5] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[4] -; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.h, z0.h[6] +; CHECK-NEXT: mov z6.h, z0.h[5] ; CHECK-NEXT: mov z3.h, z1.h[7] +; CHECK-NEXT: mov z5.h, z1.h[6] +; CHECK-NEXT: mov z7.h, z1.h[5] +; CHECK-NEXT: mov z16.h, z0.h[4] +; CHECK-NEXT: mov z17.h, z1.h[4] ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[6] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z1.h[5] -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.h, z1.h[4] -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.h, z2.h, z3.h +; CHECK-NEXT: zip1 z3.h, z4.h, z5.h +; CHECK-NEXT: zip1 z4.h, z6.h, z7.h +; CHECK-NEXT: zip1 z5.h, z16.h, z17.h ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip2_v16i16: @@ -1453,8 +1345,6 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{ define void @zip2_v8i32(ptr %a, ptr %b) #0{ ; CHECK-LABEL: zip2_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1] @@ -1462,18 +1352,13 @@ define void @zip2_v8i32(ptr 
%a, ptr %b) #0{ ; CHECK-NEXT: mov z2.s, z0.s[3] ; CHECK-NEXT: mov z4.s, z0.s[2] ; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z1.s[2] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: stp w8, w9, [sp, #8] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w9, [sp] -; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: zip1 z2.s, z2.s, z3.s +; CHECK-NEXT: zip1 z3.s, z4.s, z5.s ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zip2_v8i32: @@ -1547,197 +1432,139 @@ define void @zip2_v8i32_undef(ptr %a) #0{ define void @uzp_v32i8(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: mov z4.b, z3.b[14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z6.b, z3.b[10] -; CHECK-NEXT: mov z5.b, z3.b[12] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z7.b, z3.b[8] -; CHECK-NEXT: mov z17.b, z3.b[9] -; CHECK-NEXT: mov z18.b, z3.b[7] -; CHECK-NEXT: mov z16.b, z3.b[11] -; CHECK-NEXT: strb w8, [sp, #40] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z3.b[6] -; CHECK-NEXT: strb w9, [sp, #32] -; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.b, z3.b[4] -; CHECK-NEXT: strb w8, [sp, #47] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z3.b[2] -; CHECK-NEXT: strb w9, [sp, #46] -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: mov z7.b, z2.b[14] -; CHECK-NEXT: strb w8, [sp, #45] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z2.b[12] -; CHECK-NEXT: strb w9, [sp, #44] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: mov z16.b, z2.b[11] -; CHECK-NEXT: strb w8, [sp, #43] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z2.b[10] -; CHECK-NEXT: strb w9, [sp, #61] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: strb w8, [sp, #42] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z2.b[8] -; CHECK-NEXT: strb w9, [sp, #53] -; CHECK-NEXT: strb w8, [sp, #41] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z2.b[6] -; CHECK-NEXT: strb w8, [sp, #39] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z2.b[4] -; CHECK-NEXT: strb w8, [sp, #38] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z2.b[2] -; CHECK-NEXT: strb w8, [sp, #37] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z1.b[10] -; CHECK-NEXT: strb w8, [sp, #36] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z1.b[8] -; CHECK-NEXT: strb w8, [sp, #35] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[14] -; CHECK-NEXT: strb w8, [sp, #34] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z1.b[12] -; CHECK-NEXT: strb w8, [sp, #33] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w8, [sp, #8] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z1.b[6] -; CHECK-NEXT: strb w8, [sp, #15] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z1.b[4] -; CHECK-NEXT: strb w8, [sp, #14] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z1.b[2] -; CHECK-NEXT: strb w8, [sp, #13] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z0.b[14] -; CHECK-NEXT: strb w8, [sp, #12] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z0.b[12] -; CHECK-NEXT: strb w8, [sp, #11] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z0.b[10] -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z0.b[8] -; 
CHECK-NEXT: strb w8, [sp, #9] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z0.b[6] -; CHECK-NEXT: strb w8, [sp, #7] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.b, z0.b[4] -; CHECK-NEXT: strb w8, [sp, #6] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.b, z0.b[2] -; CHECK-NEXT: strb w8, [sp, #5] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z3.b[15] -; CHECK-NEXT: strb w8, [sp, #4] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z3.b[13] -; CHECK-NEXT: strb w8, [sp, #3] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: ldr q4, [sp, #32] -; CHECK-NEXT: strb w8, [sp, #2] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strb w8, [sp, #1] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z3.b[5] -; CHECK-NEXT: mov z3.b, z3.b[3] -; CHECK-NEXT: ldr q5, [sp] -; CHECK-NEXT: strb w8, [sp, #63] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z2.b[13] -; CHECK-NEXT: strb w8, [sp, #62] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: strb w8, [sp, #60] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: strb w8, [sp, #59] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z2.b[9] -; CHECK-NEXT: strb w8, [sp, #58] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z2.b[5] -; CHECK-NEXT: strb w8, [sp, #57] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z2.b[3] +; CHECK-NEXT: stp d13, d12, [sp, #-48]! // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov z2.b, z1.b[14] +; CHECK-NEXT: mov z3.b, z1.b[12] +; CHECK-NEXT: mov z4.b, z1.b[10] +; CHECK-NEXT: mov z5.b, z1.b[8] +; CHECK-NEXT: mov z6.b, z1.b[6] +; CHECK-NEXT: mov z7.b, z1.b[4] +; CHECK-NEXT: mov z16.b, z1.b[2] +; CHECK-NEXT: mov z18.b, z0.b[14] +; CHECK-NEXT: mov z19.b, z0.b[12] +; CHECK-NEXT: zip1 z3.b, z3.b, z2.b +; CHECK-NEXT: ldp q2, q17, [x1] +; CHECK-NEXT: mov z20.b, z0.b[10] +; CHECK-NEXT: zip1 z4.b, z5.b, z4.b +; CHECK-NEXT: zip1 z5.b, z7.b, z6.b +; CHECK-NEXT: zip1 z6.b, z1.b, z16.b +; CHECK-NEXT: mov z7.b, z0.b[8] +; CHECK-NEXT: mov z16.b, z0.b[6] +; CHECK-NEXT: mov z21.b, z0.b[4] +; CHECK-NEXT: mov z22.b, z0.b[2] +; CHECK-NEXT: mov z23.b, z17.b[14] +; CHECK-NEXT: mov z24.b, z17.b[12] +; CHECK-NEXT: mov z25.b, z17.b[10] +; CHECK-NEXT: mov z26.b, z17.b[8] +; CHECK-NEXT: mov z27.b, z17.b[6] +; CHECK-NEXT: mov z28.b, z17.b[4] +; CHECK-NEXT: mov z29.b, z17.b[2] +; CHECK-NEXT: zip1 z18.b, z19.b, z18.b +; CHECK-NEXT: zip1 z7.b, z7.b, z20.b +; CHECK-NEXT: zip1 z16.b, z21.b, z16.b +; CHECK-NEXT: zip1 z19.b, z0.b, z22.b +; CHECK-NEXT: zip1 z20.b, z24.b, z23.b +; CHECK-NEXT: zip1 z21.b, z26.b, z25.b +; CHECK-NEXT: zip1 z22.b, z28.b, z27.b +; CHECK-NEXT: mov z24.b, z2.b[14] +; CHECK-NEXT: mov z25.b, z2.b[12] +; CHECK-NEXT: mov z26.b, z2.b[10] +; CHECK-NEXT: mov z27.b, z2.b[8] +; CHECK-NEXT: zip1 z23.b, z17.b, z29.b +; CHECK-NEXT: zip1 z3.h, z4.h, z3.h +; CHECK-NEXT: zip1 z4.h, z6.h, z5.h +; CHECK-NEXT: zip1 z5.h, z7.h, z18.h +; CHECK-NEXT: zip1 z6.h, z19.h, z16.h +; CHECK-NEXT: zip1 z7.h, z21.h, z20.h +; CHECK-NEXT: zip1 z18.b, z25.b, z24.b +; CHECK-NEXT: zip1 z19.b, z27.b, z26.b +; CHECK-NEXT: mov z20.b, z2.b[6] +; CHECK-NEXT: mov z21.b, z2.b[4] +; CHECK-NEXT: mov z29.b, z17.b[3] +; CHECK-NEXT: mov z30.b, z17.b[1] +; CHECK-NEXT: mov z31.b, z2.b[15] +; 
CHECK-NEXT: mov z8.b, z2.b[13] +; CHECK-NEXT: zip1 z16.h, z23.h, z22.h +; CHECK-NEXT: mov z22.b, z2.b[2] +; CHECK-NEXT: mov z23.b, z17.b[15] +; CHECK-NEXT: mov z24.b, z17.b[13] +; CHECK-NEXT: mov z25.b, z17.b[11] +; CHECK-NEXT: mov z26.b, z17.b[9] +; CHECK-NEXT: mov z27.b, z17.b[7] +; CHECK-NEXT: mov z28.b, z17.b[5] +; CHECK-NEXT: zip1 z17.h, z19.h, z18.h +; CHECK-NEXT: zip1 z21.b, z21.b, z20.b +; CHECK-NEXT: zip1 z19.b, z30.b, z29.b +; CHECK-NEXT: zip1 z20.b, z8.b, z31.b +; CHECK-NEXT: mov z29.b, z1.b[15] +; CHECK-NEXT: mov z30.b, z1.b[13] +; CHECK-NEXT: mov z31.b, z1.b[11] +; CHECK-NEXT: mov z8.b, z1.b[9] +; CHECK-NEXT: zip1 z22.b, z2.b, z22.b +; CHECK-NEXT: zip1 z23.b, z24.b, z23.b +; CHECK-NEXT: zip1 z24.b, z26.b, z25.b +; CHECK-NEXT: zip1 z18.b, z28.b, z27.b +; CHECK-NEXT: mov z25.b, z2.b[11] +; CHECK-NEXT: mov z26.b, z2.b[9] +; CHECK-NEXT: mov z27.b, z2.b[7] +; CHECK-NEXT: mov z28.b, z2.b[5] +; CHECK-NEXT: mov z9.b, z1.b[7] +; CHECK-NEXT: mov z10.b, z1.b[5] +; CHECK-NEXT: mov z1.b, z1.b[3] +; CHECK-NEXT: mov z11.b, z0.b[11] +; CHECK-NEXT: mov z12.b, z0.b[9] +; CHECK-NEXT: zip1 z29.b, z30.b, z29.b +; CHECK-NEXT: mov z30.b, z0.b[3] +; CHECK-NEXT: mov z13.b, z0.b[1] +; CHECK-NEXT: zip1 z31.b, z8.b, z31.b +; CHECK-NEXT: mov z8.b, z2.b[3] ; CHECK-NEXT: mov z2.b, z2.b[1] -; CHECK-NEXT: strb w8, [sp, #54] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z1.b[15] -; CHECK-NEXT: strb w8, [sp, #52] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[13] -; CHECK-NEXT: strb w8, [sp, #50] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z1.b[11] -; CHECK-NEXT: strb w8, [sp, #49] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z1.b[9] -; CHECK-NEXT: strb w8, [sp, #48] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z1.b[7] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[15] -; CHECK-NEXT: strb w8, [sp, #31] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z1.b[5] -; CHECK-NEXT: strb w9, [sp, #28] -; CHECK-NEXT: strb w8, [sp, #30] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.b, z1.b[3] -; CHECK-NEXT: mov z1.b, z1.b[1] -; CHECK-NEXT: strb w8, [sp, #29] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z0.b[11] -; CHECK-NEXT: strb w8, [sp, #27] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[13] -; CHECK-NEXT: strb w8, [sp, #26] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: strb w8, [sp, #25] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.b, z0.b[9] -; CHECK-NEXT: strb w8, [sp, #24] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.b, z0.b[7] -; CHECK-NEXT: strb w8, [sp, #23] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.b, z0.b[5] -; CHECK-NEXT: strb w8, [sp, #22] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.b, z0.b[3] -; CHECK-NEXT: mov z0.b, z0.b[1] -; CHECK-NEXT: strb w8, [sp, #21] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w8, [sp, #20] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #19] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strb w8, [sp, #18] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strb w8, [sp, #17] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: add z0.b, z4.b, z0.b -; CHECK-NEXT: strb w8, [sp, #16] -; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: add z1.b, z5.b, z1.b +; CHECK-NEXT: zip1 z9.b, z10.b, z9.b +; CHECK-NEXT: zip1 z10.b, z12.b, z11.b +; CHECK-NEXT: zip1 z1.b, z0.b, z1.b +; CHECK-NEXT: zip1 z30.b, z13.b, z30.b +; CHECK-NEXT: mov z11.b, z0.b[13] +; CHECK-NEXT: mov z0.b, z0.b[5] +; CHECK-NEXT: zip1 z25.b, z26.b, z25.b +; CHECK-NEXT: zip1 z26.b, z28.b, z27.b +; CHECK-NEXT: zip1 z2.b, z2.b, 
z8.b +; CHECK-NEXT: zip1 z21.h, z22.h, z21.h +; CHECK-NEXT: zip1 z22.h, z24.h, z23.h +; CHECK-NEXT: zip1 z23.h, z31.h, z29.h +; CHECK-NEXT: zip1 z1.h, z1.h, z9.h +; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z24.h, z10.h, z11.h +; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: zip1 z0.h, z30.h, z0.h +; CHECK-NEXT: zip1 z18.h, z19.h, z18.h +; CHECK-NEXT: zip1 z19.h, z25.h, z20.h +; CHECK-NEXT: zip1 z2.h, z2.h, z26.h +; CHECK-NEXT: zip1 z3.s, z4.s, z3.s +; CHECK-NEXT: zip1 z4.s, z6.s, z5.s +; CHECK-NEXT: zip1 z5.s, z16.s, z7.s +; CHECK-NEXT: zip1 z1.s, z1.s, z23.s +; CHECK-NEXT: zip1 z6.s, z21.s, z17.s +; CHECK-NEXT: zip1 z0.s, z0.s, z24.s +; CHECK-NEXT: zip1 z7.s, z18.s, z22.s +; CHECK-NEXT: zip1 z2.s, z2.s, z19.s +; CHECK-NEXT: zip1 z3.d, z4.d, z3.d +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: zip1 z1.d, z6.d, z5.d +; CHECK-NEXT: zip1 z2.d, z2.d, z7.d +; CHECK-NEXT: add z0.b, z3.b, z0.b +; CHECK-NEXT: add z1.b, z1.b, z2.b ; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ldp d13, d12, [sp], #48 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v32i8: @@ -1922,110 +1749,71 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: mov z4.h, z3.h[6] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z6.h, z3.h[2] -; CHECK-NEXT: mov z5.h, z3.h[4] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z7.h, z2.h[6] -; CHECK-NEXT: mov z17.h, z2.h[7] -; CHECK-NEXT: mov z16.h, z3.h[1] -; CHECK-NEXT: strh w8, [sp, #40] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z2.h[4] -; CHECK-NEXT: strh w9, [sp, #32] -; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z2.h[2] -; CHECK-NEXT: strh w8, [sp, #46] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z1.h[2] -; CHECK-NEXT: strh w9, [sp, #44] -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: mov z7.h, z0.h[6] -; CHECK-NEXT: strh w8, [sp, #42] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[6] -; CHECK-NEXT: strh w9, [sp, #38] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: strh w8, [sp, #36] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z1.h[4] -; CHECK-NEXT: strh w9, [sp, #56] -; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[4] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z0.h[2] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z3.h[7] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.h, z3.h[5] -; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: ldr q3, [sp, #32] -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z2.h[5] -; CHECK-NEXT: ldr q4, [sp] -; CHECK-NEXT: strh w8, [sp, #62] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.h, z1.h[7] -; CHECK-NEXT: strh w8, [sp, #60] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z2.h[1] -; CHECK-NEXT: strh w8, [sp, #58] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z0.h[7] -; CHECK-NEXT: strh 
w8, [sp, #54] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z1.h[5] -; CHECK-NEXT: strh w9, [sp, #48] -; CHECK-NEXT: strh w8, [sp, #52] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset b8, -16 +; CHECK-NEXT: ldp q1, q6, [x0] +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: mov z3.h, z6.h[6] +; CHECK-NEXT: mov z4.h, z6.h[4] +; CHECK-NEXT: mov z5.h, z6.h[2] +; CHECK-NEXT: mov z7.h, z1.h[6] +; CHECK-NEXT: mov z16.h, z1.h[4] +; CHECK-NEXT: mov z17.h, z1.h[2] +; CHECK-NEXT: mov z18.h, z2.h[6] +; CHECK-NEXT: mov z19.h, z2.h[4] +; CHECK-NEXT: mov z20.h, z2.h[2] +; CHECK-NEXT: mov z21.h, z0.h[6] +; CHECK-NEXT: mov z22.h, z0.h[4] +; CHECK-NEXT: zip1 z3.h, z4.h, z3.h +; CHECK-NEXT: zip1 z4.h, z6.h, z5.h +; CHECK-NEXT: zip1 z5.h, z16.h, z7.h +; CHECK-NEXT: zip1 z7.h, z1.h, z17.h +; CHECK-NEXT: zip1 z16.h, z19.h, z18.h +; CHECK-NEXT: zip1 z18.h, z2.h, z20.h +; CHECK-NEXT: mov z19.h, z0.h[2] +; CHECK-NEXT: zip1 z17.h, z22.h, z21.h +; CHECK-NEXT: mov z20.h, z6.h[7] +; CHECK-NEXT: mov z21.h, z6.h[5] +; CHECK-NEXT: mov z22.h, z6.h[3] +; CHECK-NEXT: mov z6.h, z6.h[1] +; CHECK-NEXT: mov z23.h, z1.h[7] +; CHECK-NEXT: mov z24.h, z1.h[5] +; CHECK-NEXT: mov z25.h, z1.h[3] ; CHECK-NEXT: mov z1.h, z1.h[1] -; CHECK-NEXT: strh w8, [sp, #50] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z0.h[5] -; CHECK-NEXT: strh w8, [sp, #28] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #24] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: add z0.h, z3.h, z0.h -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: add z1.h, z4.h, z1.h -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: mov z26.h, z2.h[7] +; CHECK-NEXT: mov z27.h, z2.h[5] +; CHECK-NEXT: mov z28.h, z2.h[3] +; CHECK-NEXT: mov z2.h, z2.h[1] +; CHECK-NEXT: mov z29.h, z0.h[7] +; CHECK-NEXT: mov z30.h, z0.h[5] +; CHECK-NEXT: mov z31.h, z0.h[3] +; CHECK-NEXT: mov z8.h, z0.h[1] +; CHECK-NEXT: zip1 z0.h, z0.h, z19.h +; CHECK-NEXT: zip1 z19.h, z21.h, z20.h +; CHECK-NEXT: zip1 z6.h, z6.h, z22.h +; CHECK-NEXT: zip1 z20.h, z24.h, z23.h +; CHECK-NEXT: zip1 z1.h, z1.h, z25.h +; CHECK-NEXT: zip1 z21.h, z27.h, z26.h +; CHECK-NEXT: zip1 z2.h, z2.h, z28.h +; CHECK-NEXT: zip1 z22.h, z30.h, z29.h +; CHECK-NEXT: zip1 z23.h, z8.h, z31.h +; CHECK-NEXT: zip1 z3.s, z4.s, z3.s +; CHECK-NEXT: zip1 z4.s, z7.s, z5.s +; CHECK-NEXT: zip1 z5.s, z18.s, z16.s +; CHECK-NEXT: zip1 z6.s, z6.s, z19.s +; CHECK-NEXT: zip1 z1.s, z1.s, z20.s +; CHECK-NEXT: zip1 z0.s, z0.s, z17.s +; CHECK-NEXT: zip1 z2.s, z2.s, z21.s +; CHECK-NEXT: zip1 z7.s, z23.s, z22.s +; CHECK-NEXT: zip1 z3.d, z4.d, z3.d +; CHECK-NEXT: zip1 z1.d, z1.d, z6.d +; CHECK-NEXT: zip1 z0.d, z0.d, z5.d +; CHECK-NEXT: zip1 z2.d, z7.d, z2.d +; CHECK-NEXT: add z1.h, z3.h, z1.h +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v16i16: @@ -2116,32 +1904,28 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ define void @uzp_v8f32(ptr %a, ptr 
%b) #0{ ; CHECK-LABEL: uzp_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldp q6, q0, [x0] ; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: ldp q4, q1, [x1] +; CHECK-NEXT: ldp q1, q2, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov z3.s, z0.s[2] -; CHECK-NEXT: mov z5.s, z1.s[2] -; CHECK-NEXT: stp s0, s3, [sp, #24] -; CHECK-NEXT: mov z3.s, z4.s[2] -; CHECK-NEXT: stp s5, s2, [sp, #12] -; CHECK-NEXT: mov z5.s, z0.s[3] -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: stp s3, s1, [sp, #4] -; CHECK-NEXT: mov z1.s, z2.s[1] -; CHECK-NEXT: str s5, [sp, #44] +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: mov z5.s, z0.s[1] +; CHECK-NEXT: mov z7.s, z2.s[2] +; CHECK-NEXT: mov z16.s, z1.s[2] +; CHECK-NEXT: zip1 z0.s, z0.s, z3.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: mov z4.s, z6.s[1] +; CHECK-NEXT: zip1 z2.s, z2.s, z7.s ; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI21_0] -; CHECK-NEXT: str s0, [sp, #40] -; CHECK-NEXT: ldp q3, q2, [sp] -; CHECK-NEXT: tbl z0.s, { z4.s }, z5.s -; CHECK-NEXT: str s1, [sp, #32] -; CHECK-NEXT: ldr q1, [sp, #32] -; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: zip1 z7.s, z0.s, z16.s +; CHECK-NEXT: tbl z1.s, { z1.s }, z5.s +; CHECK-NEXT: zip1 z0.d, z6.d, z0.d +; CHECK-NEXT: zip1 z3.d, z4.d, z3.d +; CHECK-NEXT: zip1 z2.d, z7.d, z2.d ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z3.s -; CHECK-NEXT: stp q1, q0, [x0] -; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v8f32: @@ -2231,60 +2015,38 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{ define void @uzp_v8i16(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z2.h, z1.h[6] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z4.h, z1.h[2] -; CHECK-NEXT: mov z6.h, z0.h[4] -; CHECK-NEXT: mov z3.h, z1.h[4] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z5.h, z0.h[6] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[2] -; CHECK-NEXT: strh w9, [sp] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.h, z1.h[7] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[5] -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[1] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #4] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z0.h[7] -; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[5] -; CHECK-NEXT: strh w8, [sp, #28] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] -; CHECK-NEXT: strh w8, [sp, #26] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #18] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: ldp q3, q0, [sp] -; CHECK-NEXT: add z0.h, z3.h, z0.h +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: mov z4.h, z0.h[2] +; 
CHECK-NEXT: mov z5.h, z1.h[6] +; CHECK-NEXT: mov z6.h, z1.h[4] +; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z16.h, z0.h[7] +; CHECK-NEXT: mov z17.h, z0.h[5] +; CHECK-NEXT: mov z18.h, z0.h[3] +; CHECK-NEXT: mov z19.h, z0.h[1] +; CHECK-NEXT: mov z20.h, z1.h[7] +; CHECK-NEXT: mov z21.h, z1.h[5] +; CHECK-NEXT: mov z22.h, z1.h[3] +; CHECK-NEXT: mov z23.h, z1.h[1] +; CHECK-NEXT: zip1 z2.h, z3.h, z2.h +; CHECK-NEXT: zip1 z0.h, z0.h, z4.h +; CHECK-NEXT: zip1 z3.h, z6.h, z5.h +; CHECK-NEXT: zip1 z1.h, z1.h, z7.h +; CHECK-NEXT: zip1 z4.h, z17.h, z16.h +; CHECK-NEXT: zip1 z5.h, z19.h, z18.h +; CHECK-NEXT: zip1 z6.h, z21.h, z20.h +; CHECK-NEXT: zip1 z7.h, z23.h, z22.h +; CHECK-NEXT: zip1 z0.s, z0.s, z2.s +; CHECK-NEXT: zip1 z1.s, z1.s, z3.s +; CHECK-NEXT: zip1 z2.s, z5.s, z4.s +; CHECK-NEXT: zip1 z3.s, z7.s, z6.s +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v8i16: @@ -2341,31 +2103,21 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{ define void @uzp_v8i32_undef(ptr %a) #0{ ; CHECK-LABEL: uzp_v8i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: mov z2.s, z0.s[2] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z3.s, z1.s[2] -; CHECK-NEXT: mov z4.s, z0.s[3] -; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.s, z1.s[3] -; CHECK-NEXT: stp w8, w9, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: stp w8, w9, [sp] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: stp w9, w8, [sp, #24] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: stp w9, w8, [sp, #16] -; CHECK-NEXT: ldp q0, q1, [sp] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: mov z3.s, z0.s[2] +; CHECK-NEXT: mov z4.s, z1.s[3] +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z0.s[3] +; CHECK-NEXT: mov z7.s, z0.s[1] +; CHECK-NEXT: zip1 z1.s, z1.s, z2.s +; CHECK-NEXT: zip1 z0.s, z0.s, z3.s +; CHECK-NEXT: zip1 z2.s, z5.s, z4.s +; CHECK-NEXT: zip1 z3.s, z7.s, z6.s +; CHECK-NEXT: zip1 z0.d, z0.d, z1.d +; CHECK-NEXT: zip1 z1.d, z3.d, z2.d ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: stp q0, q0, [x0] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: uzp_v8i32_undef: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll index 88c83a214c73..c942f1eca8eb 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll @@ -10,22 +10,14 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i1> @reshuffle_v4i1_nxv4i1( %a) { ; CHECK-LABEL: reshuffle_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z1.s, z0.s[3] -; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z2.s, z0.s[2] ; CHECK-NEXT: mov z3.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: zip1 z1.h, 
z2.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %el0 = extractelement %a, i32 0 %el1 = extractelement %a, i32 1 -- GitLab From 7b65971e1f64e0736da31decae495e25db3ac773 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 18 Oct 2024 10:35:56 +0100 Subject: [PATCH 027/511] InstCombine: sink loads with invariant.load metadata (#112692) --- .../InstCombine/InstructionCombining.cpp | 3 +- .../InstCombine/sink_instruction.ll | 119 +++++++++++++++++- 2 files changed, 117 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 954c4cf19c20..c8b9f166b160 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -4822,7 +4822,8 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, // We can only sink load instructions if there is nothing between the load and // the end of block that could change the value. - if (I->mayReadFromMemory()) { + if (I->mayReadFromMemory() && + !I->hasMetadata(LLVMContext::MD_invariant_load)) { // We don't want to do any sophisticated alias analysis, so we only check // the instructions after I in I's parent block if we try to sink to its // successor block. diff --git a/llvm/test/Transforms/InstCombine/sink_instruction.ll b/llvm/test/Transforms/InstCombine/sink_instruction.ll index c938002788bc..dac40852c4bd 100644 --- a/llvm/test/Transforms/InstCombine/sink_instruction.ll +++ b/llvm/test/Transforms/InstCombine/sink_instruction.ll @@ -86,8 +86,8 @@ define i32 @test3(ptr nocapture readonly %P, i32 %i) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i32 [[I:%.*]], label [[SW_EPILOG:%.*]] [ -; CHECK-NEXT: i32 5, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 2, label [[SW_BB]] +; CHECK-NEXT: i32 5, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 @@ -190,8 +190,8 @@ define i32 @test6(ptr nocapture readonly %P, i32 %i, i1 %cond) { ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: switch i32 [[I]], label [[SW_BB:%.*]] [ -; CHECK-NEXT: i32 5, label [[SW_EPILOG:%.*]] -; CHECK-NEXT: i32 2, label [[SW_EPILOG]] +; CHECK-NEXT: i32 5, label [[SW_EPILOG:%.*]] +; CHECK-NEXT: i32 2, label [[SW_EPILOG]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: br label [[SW_EPILOG]] @@ -272,3 +272,114 @@ abort: call void @abort() unreachable } + +; Loads marked invariant can be sunk past potential memory writes. + +define i32 @invariant_load_metadata(ptr %p, i1 %cond) { +; CHECK-LABEL: @invariant_load_metadata( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[P:%.*]], align 4, !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: ret i32 [[V]] +; +entry: + %v = load i32, ptr %p, !invariant.load !0 + br i1 %cond, label %block, label %end +block: + call void @fn() + br label %end +end: + ret i32 %v +} + +; Loads not marked invariant cannot be sunk past potential memory writes. 
+ +define i32 @invariant_load_neg(ptr %p, i1 %cond) { +; CHECK-LABEL: @invariant_load_neg( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret i32 [[V]] +; +entry: + %v = load i32, ptr %p + br i1 %cond, label %block, label %end +block: + call void @fn() + br label %end +end: + ret i32 %v +} + +; Loads that aren't marked invariant but used in one branch +; can be sunk to that branch. + +define void @invariant_load_use_in_br(ptr %p, i1 %cond) { +; CHECK-LABEL: @invariant_load_use_in_br( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[TRUE_BR:%.*]], label [[FALSE_BR:%.*]] +; CHECK: true.br: +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: false.br: +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: call void @fn(i32 [[VAL]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %val = load i32, ptr %p + br i1 %cond, label %true.br, label %false.br +true.br: + call void @fn() + br label %exit +false.br: + call void @fn(i32 %val) + br label %exit +exit: + ret void +} + +; Invariant loads marked with metadata can be sunk past calls. + +define void @invariant_load_metadata_call(ptr %p, i1 %cond) { +; CHECK-LABEL: @invariant_load_metadata_call( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: br i1 [[COND:%.*]], label [[TRUE_BR:%.*]], label [[FALSE_BR:%.*]] +; CHECK: true.br: +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: false.br: +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[P:%.*]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: call void @fn(i32 [[VAL]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %val = load i32, ptr %p, !invariant.load !0 + call void @fn() + br i1 %cond, label %true.br, label %false.br +true.br: + call void @fn() + br label %exit +false.br: + call void @fn(i32 %val) + br label %exit +exit: + ret void +} + +declare void @fn() + +!0 = !{} -- GitLab From 228f88fdc8e92789e0562f8a47493493da843145 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 18 Oct 2024 11:40:38 +0200 Subject: [PATCH 028/511] [RISCV] Inline Assembly: RVC constraint and N modifier (#112561) This change implements support for the `cr` and `cf` register constraints (which allocate a RVC GPR or RVC FPR respectively), and the `N` modifier (which prints the raw encoding of a register rather than the name). The intention behind these additions is to make it easier to use inline assembly when assembling raw instructions that are not supported by the compiler, for instance when experimenting with new instructions or when supporting proprietary extensions outside the toolchain. These implement part of my proposal in riscv-non-isa/riscv-c-api-doc#92 As part of the implementation, I felt there was not enough coverage of inline assembly and the "in X" floating-point extensions, so I have added more regression tests around these configurations. 
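
For illustration, here is a minimal C sketch of how the two pieces are
meant to compose (the function names are hypothetical; the raw encodings
are the same ones exercised by the new regression tests):

    // Emit a raw c.add via the new constraint and modifier. 0x9422 is
    // the base encoding used in the tests below; "cr" confines the
    // operands to the RVC GPRs (x8-x15), and %N prints each register's
    // numeric encoding so it can be OR'd into the instruction word.
    long raw_cadd(long a, long b) {
      // c.add overwrites rd, so the first input is tied to the output.
      asm(".insn 0x2, 0x9422 | (%N0 << 7) | (%N2 << 2)"
          : "=cr"(a)
          : "0"(a), "cr"(b));
      return a;
    }

    // The same pattern with "cf" for a raw fadd.s (encoding taken from
    // the tests; requires the F extension).
    float raw_fadds(float a, float b) {
      float r;
      asm(".insn 0x4, 0x53 | (%N0 << 7) | (%N1 << 15) | (%N2 << 20)"
          : "=cf"(r)
          : "cf"(a), "cf"(b));
      return r;
    }

Note that c.add itself takes full five-bit register fields, so "cr" is
not strictly required in the first example; it is used because it also
guarantees registers that fit the three-bit fields of most other
compressed instructions.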
--- clang/lib/Basic/Targets/RISCV.cpp | 10 ++ clang/test/CodeGen/RISCV/riscv-inline-asm.c | 40 ++++- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 8 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 18 ++ llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 19 ++- .../RISCV/inline-asm-d-constraint-f.ll | 33 ++++ .../CodeGen/RISCV/inline-asm-d-modifier-N.ll | 109 ++++++++++++ .../RISCV/inline-asm-f-constraint-f.ll | 28 +++- .../CodeGen/RISCV/inline-asm-f-modifier-N.ll | 96 +++++++++++ llvm/test/CodeGen/RISCV/inline-asm-invalid.ll | 20 +++ .../RISCV/inline-asm-zdinx-constraint-r.ll | 92 ++++++++++ .../RISCV/inline-asm-zfh-constraint-f.ll | 41 +++++ .../RISCV/inline-asm-zfh-modifier-N.ll | 157 +++++++++++++++++ .../RISCV/inline-asm-zfinx-constraint-r.ll | 89 ++++++++++ .../RISCV/inline-asm-zhinx-constraint-r.ll | 158 ++++++++++++++++++ llvm/test/CodeGen/RISCV/inline-asm.ll | 66 ++++++++ .../CodeGen/RISCV/zdinx-asm-constraint.ll | 61 +++++++ 17 files changed, 1040 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll create mode 100644 llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll create mode 100644 llvm/test/CodeGen/RISCV/inline-asm-zdinx-constraint-r.ll create mode 100644 llvm/test/CodeGen/RISCV/inline-asm-zfh-modifier-N.ll create mode 100644 llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll create mode 100644 llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index 870f0f38bc30..eaaba7642bd7 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -100,6 +100,14 @@ bool RISCVTargetInfo::validateAsmConstraint( case 'S': // A symbol or label reference with a constant offset Info.setAllowsRegister(); return true; + case 'c': + // A RVC register - GPR or FPR + if (Name[1] == 'r' || Name[1] == 'f') { + Info.setAllowsRegister(); + Name += 1; + return true; + } + return false; case 'v': // A vector register. if (Name[1] == 'r' || Name[1] == 'd' || Name[1] == 'm') { @@ -114,6 +122,8 @@ bool RISCVTargetInfo::validateAsmConstraint( std::string RISCVTargetInfo::convertConstraint(const char *&Constraint) const { std::string R; switch (*Constraint) { + // c* and v* are two-letter constraints on RISC-V. + case 'c': case 'v': R = std::string("^") + std::string(Constraint, 2); Constraint += 1; diff --git a/clang/test/CodeGen/RISCV/riscv-inline-asm.c b/clang/test/CodeGen/RISCV/riscv-inline-asm.c index fa0bf6aa6aa4..75b91d3c497c 100644 --- a/clang/test/CodeGen/RISCV/riscv-inline-asm.c +++ b/clang/test/CodeGen/RISCV/riscv-inline-asm.c @@ -3,7 +3,35 @@ // RUN: %clang_cc1 -triple riscv64 -O2 -emit-llvm %s -o - \ // RUN: | FileCheck %s -// Test RISC-V specific inline assembly constraints. +// Test RISC-V specific inline assembly constraints and modifiers. 
+ +long test_r(long x) { +// CHECK-LABEL: define{{.*}} {{i64|i32}} @test_r( +// CHECK: call {{i64|i32}} asm sideeffect "", "=r,r"({{i64|i32}} %{{.*}}) + long ret; + asm volatile ("" : "=r"(ret) : "r"(x)); +// CHECK: call {{i64|i32}} asm sideeffect "", "=r,r"({{i64|i32}} %{{.*}}) + asm volatile ("" : "=r"(ret) : "r"(x)); + return ret; +} + +long test_cr(long x) { +// CHECK-LABEL: define{{.*}} {{i64|i32}} @test_cr( +// CHECK: call {{i64|i32}} asm sideeffect "", "=^cr,^cr"({{i64|i32}} %{{.*}}) + long ret; + asm volatile ("" : "=cr"(ret) : "cr"(x)); + return ret; +} + +float cf; +double cd; +void test_cf(float f, double d) { +// CHECK-LABEL: define{{.*}} void @test_cf( +// CHECK: call float asm sideeffect "", "=^cf,^cf"(float %{{.*}}) + asm volatile("" : "=cf"(cf) : "cf"(f)); +// CHECK: call double asm sideeffect "", "=^cf,^cf"(double %{{.*}}) + asm volatile("" : "=cf"(cd) : "cf"(d)); +} void test_I(void) { // CHECK-LABEL: define{{.*}} void @test_I() @@ -58,3 +86,13 @@ void test_s(void) { asm("// %0 %1 %2" :: "S"(&var), "S"(&arr[1][1]), "S"(test_s)); } + +// CHECK-LABEL: test_modifiers( +// CHECK: call void asm sideeffect "// ${0:i} ${1:i}", "r,r"({{i32|i64}} %val, i32 37) +// CHECK: call void asm sideeffect "// ${0:z} ${1:z}", "i,i"(i32 0, i32 1) +// CHECK: call void asm sideeffect "// ${0:N}", "r"({{i32|i64}} %val) +void test_modifiers(long val) { + asm volatile("// %i0 %i1" :: "r"(val), "r"(37)); + asm volatile("// %z0 %z1" :: "i"(0), "i"(1)); + asm volatile("// %N0" :: "r"(val)); +} diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 5ad09ae7290f..5eba36a0bb7d 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -19,6 +19,7 @@ #include "RISCV.h" #include "RISCVConstantPoolValue.h" #include "RISCVMachineFunctionInfo.h" +#include "RISCVRegisterInfo.h" #include "RISCVTargetMachine.h" #include "TargetInfo/RISCVTargetInfo.h" #include "llvm/ADT/APInt.h" @@ -348,6 +349,13 @@ bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (!MO.isReg()) OS << 'i'; return false; + case 'N': // Print the register encoding as an integer (0-31) + if (!MO.isReg()) + return true; + + const RISCVRegisterInfo *TRI = STI->getRegisterInfo(); + OS << TRI->getEncodingValue(MO.getReg()); + return false; } } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index fa157ca48db2..60ac58f824ed 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20366,6 +20366,8 @@ RISCVTargetLowering::getConstraintType(StringRef Constraint) const { } else { if (Constraint == "vr" || Constraint == "vd" || Constraint == "vm") return C_RegisterClass; + if (Constraint == "cr" || Constraint == "cf") + return C_RegisterClass; } return TargetLowering::getConstraintType(Constraint); } @@ -20428,6 +20430,22 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } else if (Constraint == "vm") { if (TRI->isTypeLegalForClass(RISCV::VMV0RegClass, VT.SimpleTy)) return std::make_pair(0U, &RISCV::VMV0RegClass); + } else if (Constraint == "cr") { + if (VT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) + return std::make_pair(0U, &RISCV::GPRF16CRegClass); + if (VT == MVT::f32 && Subtarget.hasStdExtZfinx()) + return std::make_pair(0U, &RISCV::GPRF32CRegClass); + if (VT == MVT::f64 && Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit()) + return std::make_pair(0U, &RISCV::GPRPairCRegClass); + if 
(!VT.isVector()) + return std::make_pair(0U, &RISCV::GPRCRegClass); + } else if (Constraint == "cf") { + if (Subtarget.hasStdExtZfhmin() && VT == MVT::f16) + return std::make_pair(0U, &RISCV::FPR16CRegClass); + if (Subtarget.hasStdExtF() && VT == MVT::f32) + return std::make_pair(0U, &RISCV::FPR32CRegClass); + if (Subtarget.hasStdExtD() && VT == MVT::f64) + return std::make_pair(0U, &RISCV::FPR64CRegClass); } // Clang will correctly decode the usage of register name aliases into their diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 33363aa8b718..250f3c10f309 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -338,6 +338,11 @@ def FPR16 : RISCVRegisterClass<[f16, bf16], 16, (add (sequence "F%u_H", 18, 27) // fs2-fs11 )>; +def FPR16C : RISCVRegisterClass<[f16, bf16], 16, (add + (sequence "F%u_H", 15, 10), + (sequence "F%u_H", 8, 9) +)>; + def FPR32 : RISCVRegisterClass<[f32], 32, (add (sequence "F%u_F", 15, 10), (sequence "F%u_F", 0, 7), @@ -667,6 +672,10 @@ def GPRF32C : RISCVRegisterClass<[f32], 32, (add (sequence "X%u_W", 10, 15), (sequence "X%u_W", 8, 9))>; def GPRF32NoX0 : RISCVRegisterClass<[f32], 32, (sub GPRF32, X0_W)>; +def XLenPairRI : RegInfoByHwMode< + [RV32, RV64], + [RegInfo<64, 64, 32>, RegInfo<128, 128, 64>]>; + // Dummy zero register for use in the register pair containing X0 (as X1 is // not read to or written when the X0 register pair is used). def DUMMY_REG_PAIR_WITH_X0 : RISCVReg<0, "0">; @@ -698,9 +707,8 @@ let RegAltNameIndices = [ABIRegAltName] in { } } -let RegInfos = RegInfoByHwMode<[RV32, RV64], - [RegInfo<64, 64, 32>, RegInfo<128, 128, 64>]>, - DecoderMethod = "DecodeGPRPairRegisterClass" in +let RegInfos = XLenPairRI, + DecoderMethod = "DecodeGPRPairRegisterClass" in { def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add X10_X11, X12_X13, X14_X15, X16_X17, X6_X7, @@ -710,6 +718,11 @@ def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add X0_Pair, X2_X3, X4_X5 )>; +def GPRPairC : RISCVRegisterClass<[XLenPairFVT], 64, (add + X10_X11, X12_X13, X14_X15, X8_X9 +)>; +} // let RegInfos = XLenPairRI, DecoderMethod = "DecodeGPRPairRegisterClass" + // The register class is added for inline assembly for vector mask types. 
def VM : VReg; diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll index c480ba800c69..08e917365820 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll @@ -39,6 +39,39 @@ define double @constraint_f_double(double %a) nounwind { ret double %2 } +define double @constraint_cf_double(double %a) nounwind { +; RV32F-LABEL: constraint_cf_double: +; RV32F: # %bb.0: +; RV32F-NEXT: addi sp, sp, -16 +; RV32F-NEXT: sw a0, 8(sp) +; RV32F-NEXT: sw a1, 12(sp) +; RV32F-NEXT: fld fa5, 8(sp) +; RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fa4, %lo(gd)(a0) +; RV32F-NEXT: #APP +; RV32F-NEXT: fadd.d fa5, fa5, fa4 +; RV32F-NEXT: #NO_APP +; RV32F-NEXT: fsd fa5, 8(sp) +; RV32F-NEXT: lw a0, 8(sp) +; RV32F-NEXT: lw a1, 12(sp) +; RV32F-NEXT: addi sp, sp, 16 +; RV32F-NEXT: ret +; +; RV64F-LABEL: constraint_cf_double: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a1, %hi(gd) +; RV64F-NEXT: fld fa5, %lo(gd)(a1) +; RV64F-NEXT: fmv.d.x fa4, a0 +; RV64F-NEXT: #APP +; RV64F-NEXT: fadd.d fa5, fa4, fa5 +; RV64F-NEXT: #NO_APP +; RV64F-NEXT: fmv.x.d a0, fa5 +; RV64F-NEXT: ret + %1 = load double, ptr @gd + %2 = tail call double asm "fadd.d $0, $1, $2", "=^cf,^cf,^cf"(double %a, double %1) + ret double %2 +} + define double @constraint_f_double_abi_name(double %a) nounwind { ; RV32F-LABEL: constraint_f_double_abi_name: ; RV32F: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll new file mode 100644 index 000000000000..581cf8e3bf3c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d -target-abi=ilp32 -verify-machineinstrs -no-integrated-as < %s \ +; RUN: | FileCheck -check-prefix=RV32F %s +; RUN: llc -mtriple=riscv64 -mattr=+d -target-abi=lp64 -verify-machineinstrs -no-integrated-as < %s \ +; RUN: | FileCheck -check-prefix=RV64F %s + +;; `.insn 0x4, 0x02000053 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)` is +;; the raw encoding for `fadd.d` + +@gd = external global double + +define double @constraint_f_double(double %a) nounwind { +; RV32F-LABEL: constraint_f_double: +; RV32F: # %bb.0: +; RV32F-NEXT: addi sp, sp, -16 +; RV32F-NEXT: sw a0, 8(sp) +; RV32F-NEXT: sw a1, 12(sp) +; RV32F-NEXT: fld fa5, 8(sp) +; RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fa4, %lo(gd)(a0) +; RV32F-NEXT: #APP +; RV32F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20) +; RV32F-NEXT: #NO_APP +; RV32F-NEXT: fsd fa5, 8(sp) +; RV32F-NEXT: lw a0, 8(sp) +; RV32F-NEXT: lw a1, 12(sp) +; RV32F-NEXT: addi sp, sp, 16 +; RV32F-NEXT: ret +; +; RV64F-LABEL: constraint_f_double: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a1, %hi(gd) +; RV64F-NEXT: fld fa5, %lo(gd)(a1) +; RV64F-NEXT: fmv.d.x fa4, a0 +; RV64F-NEXT: #APP +; RV64F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (14 << 15) | (15 << 20) +; RV64F-NEXT: #NO_APP +; RV64F-NEXT: fmv.x.d a0, fa5 +; RV64F-NEXT: ret + %1 = load double, ptr @gd + %2 = tail call double asm ".insn 0x4, 0x02000053 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "=f,f,f"(double %a, double %1) + ret double %2 +} + +define double @constraint_cf_double(double %a) nounwind { +; RV32F-LABEL: constraint_cf_double: +; RV32F: # %bb.0: +; RV32F-NEXT: addi sp, sp, -16 +; RV32F-NEXT: sw a0, 8(sp) +; RV32F-NEXT: sw a1, 12(sp) +; RV32F-NEXT: fld fa5, 8(sp) +; RV32F-NEXT: lui 
a0, %hi(gd) +; RV32F-NEXT: fld fa4, %lo(gd)(a0) +; RV32F-NEXT: #APP +; RV32F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20) +; RV32F-NEXT: #NO_APP +; RV32F-NEXT: fsd fa5, 8(sp) +; RV32F-NEXT: lw a0, 8(sp) +; RV32F-NEXT: lw a1, 12(sp) +; RV32F-NEXT: addi sp, sp, 16 +; RV32F-NEXT: ret +; +; RV64F-LABEL: constraint_cf_double: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a1, %hi(gd) +; RV64F-NEXT: fld fa5, %lo(gd)(a1) +; RV64F-NEXT: fmv.d.x fa4, a0 +; RV64F-NEXT: #APP +; RV64F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (14 << 15) | (15 << 20) +; RV64F-NEXT: #NO_APP +; RV64F-NEXT: fmv.x.d a0, fa5 +; RV64F-NEXT: ret + %1 = load double, ptr @gd + %2 = tail call double asm ".insn 0x4, 0x02000053 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "=^cf,^cf,^cf"(double %a, double %1) + ret double %2 +} + +define double @constraint_f_double_abi_name(double %a) nounwind { +; RV32F-LABEL: constraint_f_double_abi_name: +; RV32F: # %bb.0: +; RV32F-NEXT: addi sp, sp, -16 +; RV32F-NEXT: sw a0, 8(sp) +; RV32F-NEXT: sw a1, 12(sp) +; RV32F-NEXT: fld fa1, 8(sp) +; RV32F-NEXT: lui a0, %hi(gd) +; RV32F-NEXT: fld fs0, %lo(gd)(a0) +; RV32F-NEXT: #APP +; RV32F-NEXT: .insn 0x4, 0x02000053 | (0 << 7) | (11 << 15) | (8 << 20) +; RV32F-NEXT: #NO_APP +; RV32F-NEXT: fsd ft0, 8(sp) +; RV32F-NEXT: lw a0, 8(sp) +; RV32F-NEXT: lw a1, 12(sp) +; RV32F-NEXT: addi sp, sp, 16 +; RV32F-NEXT: ret +; +; RV64F-LABEL: constraint_f_double_abi_name: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a1, %hi(gd) +; RV64F-NEXT: fld fs0, %lo(gd)(a1) +; RV64F-NEXT: fmv.d.x fa1, a0 +; RV64F-NEXT: #APP +; RV64F-NEXT: .insn 0x4, 0x02000053 | (0 << 7) | (11 << 15) | (8 << 20) +; RV64F-NEXT: #NO_APP +; RV64F-NEXT: fmv.x.d a0, ft0 +; RV64F-NEXT: ret + %1 = load double, ptr @gd + %2 = tail call double asm ".insn 0x4, 0x02000053 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "={ft0},{fa1},{fs0}"(double %a, double %1) + ret double %2 +} diff --git a/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll index 91922cd236df..a91c6544f9e2 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions gave been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+f -target-abi=ilp32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32F %s ; RUN: llc -mtriple=riscv64 -mattr=+f -target-abi=lp64 -verify-machineinstrs < %s \ @@ -38,6 +37,33 @@ define float @constraint_f_float(float %a) nounwind { ret float %2 } +define float @constraint_cf_float(float %a) nounwind { +; RV32F-LABEL: constraint_cf_float: +; RV32F: # %bb.0: +; RV32F-NEXT: lui a1, %hi(gf) +; RV32F-NEXT: flw fa5, %lo(gf)(a1) +; RV32F-NEXT: fmv.w.x fa4, a0 +; RV32F-NEXT: #APP +; RV32F-NEXT: fadd.s fa5, fa4, fa5 +; RV32F-NEXT: #NO_APP +; RV32F-NEXT: fmv.x.w a0, fa5 +; RV32F-NEXT: ret +; +; RV64F-LABEL: constraint_cf_float: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a1, %hi(gf) +; RV64F-NEXT: flw fa5, %lo(gf)(a1) +; RV64F-NEXT: fmv.w.x fa4, a0 +; RV64F-NEXT: #APP +; RV64F-NEXT: fadd.s fa5, fa4, fa5 +; RV64F-NEXT: #NO_APP +; RV64F-NEXT: fmv.x.w a0, fa5 +; RV64F-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm "fadd.s $0, $1, $2", "=^cf,cf,cf"(float %a, float %1) + ret float %2 +} + define float @constraint_f_float_abi_name(float %a) nounwind { ; RV32F-LABEL: constraint_f_float_abi_name: ; RV32F: # %bb.0: diff --git 
a/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll new file mode 100644 index 000000000000..a0de5c71a7df --- /dev/null +++ b/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll @@ -0,0 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+f -target-abi=ilp32 -verify-machineinstrs -no-integrated-as < %s \ +; RUN: | FileCheck -check-prefix=RV32F %s +; RUN: llc -mtriple=riscv64 -mattr=+f -target-abi=lp64 -verify-machineinstrs -no-integrated-as < %s \ +; RUN: | FileCheck -check-prefix=RV64F %s +; RUN: llc -mtriple=riscv32 -mattr=+d -target-abi=ilp32 -verify-machineinstrs -no-integrated-as < %s \ +; RUN: | FileCheck -check-prefix=RV32F %s +; RUN: llc -mtriple=riscv64 -mattr=+d -target-abi=lp64 -verify-machineinstrs -no-integrated-as < %s \ +; RUN: | FileCheck -check-prefix=RV64F %s + +;; `.insn 0x4, 0x53 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)` is +;; the raw encoding for `fadd.s` + +@gf = external global float + +define float @constraint_f_modifier_N_float(float %a) nounwind { +; RV32F-LABEL: constraint_f_modifier_N_float: +; RV32F: # %bb.0: +; RV32F-NEXT: lui a1, %hi(gf) +; RV32F-NEXT: flw fa5, %lo(gf)(a1) +; RV32F-NEXT: fmv.w.x fa4, a0 +; RV32F-NEXT: #APP +; RV32F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20) +; RV32F-NEXT: #NO_APP +; RV32F-NEXT: fmv.x.w a0, fa5 +; RV32F-NEXT: ret +; +; RV64F-LABEL: constraint_f_modifier_N_float: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a1, %hi(gf) +; RV64F-NEXT: flw fa5, %lo(gf)(a1) +; RV64F-NEXT: fmv.w.x fa4, a0 +; RV64F-NEXT: #APP +; RV64F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20) +; RV64F-NEXT: #NO_APP +; RV64F-NEXT: fmv.x.w a0, fa5 +; RV64F-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm ".insn 0x4, 0x53 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "=f,f,f"(float %a, float %1) + ret float %2 +} + + +define float @constraint_cf_modifier_N_float(float %a) nounwind { +; RV32F-LABEL: constraint_cf_modifier_N_float: +; RV32F: # %bb.0: +; RV32F-NEXT: lui a1, %hi(gf) +; RV32F-NEXT: flw fa5, %lo(gf)(a1) +; RV32F-NEXT: fmv.w.x fa4, a0 +; RV32F-NEXT: #APP +; RV32F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20) +; RV32F-NEXT: #NO_APP +; RV32F-NEXT: fmv.x.w a0, fa5 +; RV32F-NEXT: ret +; +; RV64F-LABEL: constraint_cf_modifier_N_float: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a1, %hi(gf) +; RV64F-NEXT: flw fa5, %lo(gf)(a1) +; RV64F-NEXT: fmv.w.x fa4, a0 +; RV64F-NEXT: #APP +; RV64F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20) +; RV64F-NEXT: #NO_APP +; RV64F-NEXT: fmv.x.w a0, fa5 +; RV64F-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm ".insn 0x4, 0x53 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "=^cf,^cf,^cf"(float %a, float %1) + ret float %2 +} + +define float @modifier_N_float_abi_name(float %a) nounwind { +; RV32F-LABEL: modifier_N_float_abi_name: +; RV32F: # %bb.0: +; RV32F-NEXT: lui a1, %hi(gf) +; RV32F-NEXT: flw fs0, %lo(gf)(a1) +; RV32F-NEXT: fmv.w.x fa0, a0 +; RV32F-NEXT: #APP +; RV32F-NEXT: .insn 0x4, 0x53 | (0 << 7) | (10 << 15) | (8 << 20) +; RV32F-NEXT: #NO_APP +; RV32F-NEXT: fmv.x.w a0, ft0 +; RV32F-NEXT: ret +; +; RV64F-LABEL: modifier_N_float_abi_name: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a1, %hi(gf) +; RV64F-NEXT: flw fs0, %lo(gf)(a1) +; RV64F-NEXT: fmv.w.x fa0, a0 +; RV64F-NEXT: #APP +; RV64F-NEXT: .insn 0x4, 0x53 | (0 << 7) | (10 << 15) | (8 << 20) +; RV64F-NEXT: #NO_APP +; RV64F-NEXT: fmv.x.w 
a0, ft0 +; RV64F-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm ".insn 0x4, 0x53 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "={ft0},{fa0},{fs0}"(float %a, float %1) + ret float %2 +} diff --git a/llvm/test/CodeGen/RISCV/inline-asm-invalid.ll b/llvm/test/CodeGen/RISCV/inline-asm-invalid.ll index 14b7cb896674..deffa177c5e6 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-invalid.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-invalid.ll @@ -31,6 +31,14 @@ define void @constraint_f() nounwind { ret void } +define void @constraint_cf() nounwind { +; CHECK: error: couldn't allocate input reg for constraint 'cf' + tail call void asm "fadd.s fa0, fa0, $0", "^cf"(float 0.0) +; CHECK: error: couldn't allocate input reg for constraint 'cf' + tail call void asm "fadd.d fa0, fa0, $0", "^cf"(double 0.0) + ret void +} + define void @constraint_r_fixed_vec() nounwind { ; CHECK: error: couldn't allocate input reg for constraint 'r' tail call void asm "add a0, a0, $0", "r"(<4 x i32> zeroinitializer) @@ -42,3 +50,15 @@ define void @constraint_r_scalable_vec() nounwind { tail call void asm "add a0, a0, $0", "r"( zeroinitializer) ret void } + +define void @constraint_cr_fixed_vec() nounwind { +; CHECK: error: couldn't allocate input reg for constraint 'cr' + tail call void asm "add a0, a0, $0", "^cr"(<4 x i32> zeroinitializer) + ret void +} + +define void @constraint_cr_scalable_vec() nounwind { +; CHECK: error: couldn't allocate input reg for constraint 'cr' + tail call void asm "add a0, a0, $0", "^cr"( zeroinitializer) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zdinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zdinx-constraint-r.ll new file mode 100644 index 000000000000..15729ee2bc61 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/inline-asm-zdinx-constraint-r.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+zdinx -target-abi=ilp32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32FINX %s +; RUN: llc -mtriple=riscv64 -mattr=+zdinx -target-abi=lp64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64FINX %s + +@gd = external global double + +define double @constraint_r_double(double %a) nounwind { +; RV32FINX-LABEL: constraint_r_double: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: lui a2, %hi(gd) +; RV32FINX-NEXT: lw a3, %lo(gd+4)(a2) +; RV32FINX-NEXT: lw a2, %lo(gd)(a2) +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.d a0, a0, a2 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_r_double: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: lui a1, %hi(gd) +; RV64FINX-NEXT: ld a1, %lo(gd)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.d a0, a0, a1 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: ret + %1 = load double, ptr @gd + %2 = tail call double asm "fadd.d $0, $1, $2", "=r,r,r"(double %a, double %1) + ret double %2 +} + +define double @constraint_cr_double(double %a) nounwind { +; RV32FINX-LABEL: constraint_cr_double: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: lui a2, %hi(gd) +; RV32FINX-NEXT: lw a3, %lo(gd+4)(a2) +; RV32FINX-NEXT: lw a2, %lo(gd)(a2) +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.d a0, a0, a2 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_cr_double: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: lui a1, %hi(gd) +; RV64FINX-NEXT: ld a1, %lo(gd)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.d a0, a0, a1 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: ret + %1 = load 
double, ptr @gd + %2 = tail call double asm "fadd.d $0, $1, $2", "=^cr,^cr,^cr"(double %a, double %1) + ret double %2 +} + +define double @constraint_double_abi_name(double %a) nounwind { +; RV32FINX-LABEL: constraint_double_abi_name: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: addi sp, sp, -16 +; RV32FINX-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32FINX-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32FINX-NEXT: lui a2, %hi(gd) +; RV32FINX-NEXT: lw s0, %lo(gd)(a2) +; RV32FINX-NEXT: lw s1, %lo(gd+4)(a2) +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.d t1, a0, s0 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: mv a0, t1 +; RV32FINX-NEXT: mv a1, t2 +; RV32FINX-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32FINX-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32FINX-NEXT: addi sp, sp, 16 +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_double_abi_name: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: addi sp, sp, -16 +; RV64FINX-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64FINX-NEXT: lui a1, %hi(gd) +; RV64FINX-NEXT: ld s0, %lo(gd)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.d t1, a0, s0 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: mv a0, t1 +; RV64FINX-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64FINX-NEXT: addi sp, sp, 16 +; RV64FINX-NEXT: ret + %1 = load double, ptr @gd + %2 = tail call double asm "fadd.d $0, $1, $2", "={t1},{a0},{s0}"(double %a, double %1) + ret double %2 +} diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zfh-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-zfh-constraint-f.ll index 8caf5956e7a7..83145ba69673 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-zfh-constraint-f.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-zfh-constraint-f.ll @@ -51,6 +51,47 @@ define half @constraint_f_half(half %a) nounwind { ret half %2 } +define half @constraint_cf_half(half %a) nounwind { +; RV32ZFH-LABEL: constraint_cf_half: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(gh) +; RV32ZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV32ZFH-NEXT: #APP +; RV32ZFH-NEXT: fadd.h fa0, fa0, fa5 +; RV32ZFH-NEXT: #NO_APP +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: constraint_cf_half: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: lui a0, %hi(gh) +; RV64ZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV64ZFH-NEXT: #APP +; RV64ZFH-NEXT: fadd.h fa0, fa0, fa5 +; RV64ZFH-NEXT: #NO_APP +; RV64ZFH-NEXT: ret +; +; RV32DZFH-LABEL: constraint_cf_half: +; RV32DZFH: # %bb.0: +; RV32DZFH-NEXT: lui a0, %hi(gh) +; RV32DZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV32DZFH-NEXT: #APP +; RV32DZFH-NEXT: fadd.h fa0, fa0, fa5 +; RV32DZFH-NEXT: #NO_APP +; RV32DZFH-NEXT: ret +; +; RV64DZFH-LABEL: constraint_cf_half: +; RV64DZFH: # %bb.0: +; RV64DZFH-NEXT: lui a0, %hi(gh) +; RV64DZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV64DZFH-NEXT: #APP +; RV64DZFH-NEXT: fadd.h fa0, fa0, fa5 +; RV64DZFH-NEXT: #NO_APP +; RV64DZFH-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm "fadd.h $0, $1, $2", "=^cf,^cf,^cf"(half %a, half %1) + ret half %2 +} + define half @constraint_f_half_abi_name(half %a) nounwind { ; RV32ZFH-LABEL: constraint_f_half_abi_name: ; RV32ZFH: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zfh-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-zfh-modifier-N.ll new file mode 100644 index 000000000000..d1eb2a2d8b10 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/inline-asm-zfh-modifier-N.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=zfh -verify-machineinstrs -no-integrated-as < %s \ +; RUN: -target-abi=ilp32f | FileCheck -check-prefix=RV32ZFH %s +; RUN: 
llc -mtriple=riscv64 -mattr=zfh -verify-machineinstrs -no-integrated-as < %s \ +; RUN: -target-abi=lp64f | FileCheck -check-prefix=RV64ZFH %s +; RUN: llc -mtriple=riscv32 -mattr=zfh,+d -verify-machineinstrs -no-integrated-as < %s \ +; RUN: -target-abi=ilp32d | FileCheck -check-prefix=RV32DZFH %s +; RUN: llc -mtriple=riscv64 -mattr=zfh,+d -verify-machineinstrs -no-integrated-as < %s \ +; RUN: -target-abi=lp64d | FileCheck -check-prefix=RV64DZFH %s + +;; `.insn 0x4, 0x04000053 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)` is +;; the raw encoding for `fadd.h` + +@gh = external global half + +define half @constraint_f_half(half %a) nounwind { +; RV32ZFH-LABEL: constraint_f_half: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(gh) +; RV32ZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV32ZFH-NEXT: #APP +; RV32ZFH-NEXT: .insn 0x4, 0x04000053 | (10 << 7) | (10 << 15) | (15 << 20) +; RV32ZFH-NEXT: #NO_APP +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: constraint_f_half: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: lui a0, %hi(gh) +; RV64ZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV64ZFH-NEXT: #APP +; RV64ZFH-NEXT: .insn 0x4, 0x04000053 | (10 << 7) | (10 << 15) | (15 << 20) +; RV64ZFH-NEXT: #NO_APP +; RV64ZFH-NEXT: ret +; +; RV32DZFH-LABEL: constraint_f_half: +; RV32DZFH: # %bb.0: +; RV32DZFH-NEXT: lui a0, %hi(gh) +; RV32DZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV32DZFH-NEXT: #APP +; RV32DZFH-NEXT: .insn 0x4, 0x04000053 | (10 << 7) | (10 << 15) | (15 << 20) +; RV32DZFH-NEXT: #NO_APP +; RV32DZFH-NEXT: ret +; +; RV64DZFH-LABEL: constraint_f_half: +; RV64DZFH: # %bb.0: +; RV64DZFH-NEXT: lui a0, %hi(gh) +; RV64DZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV64DZFH-NEXT: #APP +; RV64DZFH-NEXT: .insn 0x4, 0x04000053 | (10 << 7) | (10 << 15) | (15 << 20) +; RV64DZFH-NEXT: #NO_APP +; RV64DZFH-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm ".insn 0x4, 0x04000053 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "=f,f,f"(half %a, half %1) + ret half %2 +} + +define half @constraint_cf_half(half %a) nounwind { +; RV32ZFH-LABEL: constraint_cf_half: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(gh) +; RV32ZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV32ZFH-NEXT: #APP +; RV32ZFH-NEXT: .insn 0x4, 0x04000053 | (10 << 7) | (10 << 15) | (15 << 20) +; RV32ZFH-NEXT: #NO_APP +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: constraint_cf_half: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: lui a0, %hi(gh) +; RV64ZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV64ZFH-NEXT: #APP +; RV64ZFH-NEXT: .insn 0x4, 0x04000053 | (10 << 7) | (10 << 15) | (15 << 20) +; RV64ZFH-NEXT: #NO_APP +; RV64ZFH-NEXT: ret +; +; RV32DZFH-LABEL: constraint_cf_half: +; RV32DZFH: # %bb.0: +; RV32DZFH-NEXT: lui a0, %hi(gh) +; RV32DZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV32DZFH-NEXT: #APP +; RV32DZFH-NEXT: .insn 0x4, 0x04000053 | (10 << 7) | (10 << 15) | (15 << 20) +; RV32DZFH-NEXT: #NO_APP +; RV32DZFH-NEXT: ret +; +; RV64DZFH-LABEL: constraint_cf_half: +; RV64DZFH: # %bb.0: +; RV64DZFH-NEXT: lui a0, %hi(gh) +; RV64DZFH-NEXT: flh fa5, %lo(gh)(a0) +; RV64DZFH-NEXT: #APP +; RV64DZFH-NEXT: .insn 0x4, 0x04000053 | (10 << 7) | (10 << 15) | (15 << 20) +; RV64DZFH-NEXT: #NO_APP +; RV64DZFH-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm ".insn 0x4, 0x04000053 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "=^cf,^cf,^cf"(half %a, half %1) + ret half %2 +} + +define half @constraint_f_half_abi_name(half %a) nounwind { +; RV32ZFH-LABEL: constraint_f_half_abi_name: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: addi sp, sp, -16 +; RV32ZFH-NEXT: fsw fs0, 12(sp) # 4-byte Folded Spill +; RV32ZFH-NEXT: lui a0, %hi(gh) 
+; RV32ZFH-NEXT: flh fs0, %lo(gh)(a0) +; RV32ZFH-NEXT: #APP +; RV32ZFH-NEXT: .insn 0x4, 0x04000053 | (0 << 7) | (10 << 15) | (8 << 20) +; RV32ZFH-NEXT: #NO_APP +; RV32ZFH-NEXT: fmv.h fa0, ft0 +; RV32ZFH-NEXT: flw fs0, 12(sp) # 4-byte Folded Reload +; RV32ZFH-NEXT: addi sp, sp, 16 +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: constraint_f_half_abi_name: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: addi sp, sp, -16 +; RV64ZFH-NEXT: fsw fs0, 12(sp) # 4-byte Folded Spill +; RV64ZFH-NEXT: lui a0, %hi(gh) +; RV64ZFH-NEXT: flh fs0, %lo(gh)(a0) +; RV64ZFH-NEXT: #APP +; RV64ZFH-NEXT: .insn 0x4, 0x04000053 | (0 << 7) | (10 << 15) | (8 << 20) +; RV64ZFH-NEXT: #NO_APP +; RV64ZFH-NEXT: fmv.h fa0, ft0 +; RV64ZFH-NEXT: flw fs0, 12(sp) # 4-byte Folded Reload +; RV64ZFH-NEXT: addi sp, sp, 16 +; RV64ZFH-NEXT: ret +; +; RV32DZFH-LABEL: constraint_f_half_abi_name: +; RV32DZFH: # %bb.0: +; RV32DZFH-NEXT: addi sp, sp, -16 +; RV32DZFH-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill +; RV32DZFH-NEXT: lui a0, %hi(gh) +; RV32DZFH-NEXT: flh fs0, %lo(gh)(a0) +; RV32DZFH-NEXT: #APP +; RV32DZFH-NEXT: .insn 0x4, 0x04000053 | (0 << 7) | (10 << 15) | (8 << 20) +; RV32DZFH-NEXT: #NO_APP +; RV32DZFH-NEXT: fmv.h fa0, ft0 +; RV32DZFH-NEXT: fld fs0, 8(sp) # 8-byte Folded Reload +; RV32DZFH-NEXT: addi sp, sp, 16 +; RV32DZFH-NEXT: ret +; +; RV64DZFH-LABEL: constraint_f_half_abi_name: +; RV64DZFH: # %bb.0: +; RV64DZFH-NEXT: addi sp, sp, -16 +; RV64DZFH-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill +; RV64DZFH-NEXT: lui a0, %hi(gh) +; RV64DZFH-NEXT: flh fs0, %lo(gh)(a0) +; RV64DZFH-NEXT: #APP +; RV64DZFH-NEXT: .insn 0x4, 0x04000053 | (0 << 7) | (10 << 15) | (8 << 20) +; RV64DZFH-NEXT: #NO_APP +; RV64DZFH-NEXT: fmv.h fa0, ft0 +; RV64DZFH-NEXT: fld fs0, 8(sp) # 8-byte Folded Reload +; RV64DZFH-NEXT: addi sp, sp, 16 +; RV64DZFH-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm ".insn 0x4, 0x04000053 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "={ft0},{fa0},{fs0}"(half %a, half %1) + ret half %2 +} diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll new file mode 100644 index 000000000000..a8d3515fe189 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+zfinx -target-abi=ilp32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32FINX %s +; RUN: llc -mtriple=riscv64 -mattr=+zfinx -target-abi=lp64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64FINX %s + +@gf = external global float + +define float @constraint_r_float(float %a) nounwind { +; RV32FINX-LABEL: constraint_r_float: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: lui a1, %hi(gf) +; RV32FINX-NEXT: lw a1, %lo(gf)(a1) +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.s a0, a0, a1 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_r_float: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: lui a1, %hi(gf) +; RV64FINX-NEXT: lw a1, %lo(gf)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.s a0, a0, a1 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm "fadd.s $0, $1, $2", "=r,r,r"(float %a, float %1) + ret float %2 +} + +define float @constraint_cr_float(float %a) nounwind { +; RV32FINX-LABEL: constraint_cr_float: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: lui a1, %hi(gf) +; RV32FINX-NEXT: lw a1, %lo(gf)(a1) +; RV32FINX-NEXT: #APP +; 
RV32FINX-NEXT: fadd.s a0, a0, a1 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_cr_float: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: lui a1, %hi(gf) +; RV64FINX-NEXT: lw a1, %lo(gf)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.s a0, a0, a1 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm "fadd.s $0, $1, $2", "=^cr,cr,cr"(float %a, float %1) + ret float %2 +} + +define float @constraint_float_abi_name(float %a) nounwind { +; RV32FINX-LABEL: constraint_float_abi_name: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: addi sp, sp, -16 +; RV32FINX-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32FINX-NEXT: lui a1, %hi(gf) +; RV32FINX-NEXT: lw s0, %lo(gf)(a1) +; RV32FINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.s t0, a0, s0 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: mv a0, t0 +; RV32FINX-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32FINX-NEXT: addi sp, sp, 16 +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_float_abi_name: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: addi sp, sp, -16 +; RV64FINX-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64FINX-NEXT: lui a1, %hi(gf) +; RV64FINX-NEXT: lw s0, %lo(gf)(a1) +; RV64FINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.s t0, a0, s0 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: mv a0, t0 +; RV64FINX-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64FINX-NEXT: addi sp, sp, 16 +; RV64FINX-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm "fadd.s $0, $1, $2", "={t0},{a0},{s0}"(float %a, float %1) + ret float %2 +} + diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll new file mode 100644 index 000000000000..f9707c6c8995 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+zhinx -verify-machineinstrs < %s \ +; RUN: -target-abi=ilp32 | FileCheck -check-prefix=RV32ZHINX %s +; RUN: llc -mtriple=riscv64 -mattr=+zhinx -verify-machineinstrs < %s \ +; RUN: -target-abi=lp64 | FileCheck -check-prefix=RV64ZHINX %s +; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zhinx -verify-machineinstrs < %s \ +; RUN: -target-abi=ilp32 | FileCheck -check-prefix=RV32DINXZHINX %s +; RUN: llc -mtriple=riscv64 -mattr=+zdinx,+zhinx -verify-machineinstrs < %s \ +; RUN: -target-abi=lp64 | FileCheck -check-prefix=RV64DINXZHINX %s + +@gh = external global half + +define half @constraint_r_half(half %a) nounwind { +; RV32ZHINX-LABEL: constraint_r_half: +; RV32ZHINX: # %bb.0: +; RV32ZHINX-NEXT: lui a1, %hi(gh) +; RV32ZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV32ZHINX-NEXT: #APP +; RV32ZHINX-NEXT: fadd.h a0, a0, a1 +; RV32ZHINX-NEXT: #NO_APP +; RV32ZHINX-NEXT: ret +; +; RV64ZHINX-LABEL: constraint_r_half: +; RV64ZHINX: # %bb.0: +; RV64ZHINX-NEXT: lui a1, %hi(gh) +; RV64ZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV64ZHINX-NEXT: #APP +; RV64ZHINX-NEXT: fadd.h a0, a0, a1 +; RV64ZHINX-NEXT: #NO_APP +; RV64ZHINX-NEXT: ret +; +; RV32DINXZHINX-LABEL: constraint_r_half: +; RV32DINXZHINX: # %bb.0: +; RV32DINXZHINX-NEXT: lui a1, %hi(gh) +; RV32DINXZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV32DINXZHINX-NEXT: #APP +; RV32DINXZHINX-NEXT: fadd.h a0, a0, a1 +; RV32DINXZHINX-NEXT: #NO_APP +; RV32DINXZHINX-NEXT: ret +; +; RV64DINXZHINX-LABEL: constraint_r_half: +; RV64DINXZHINX: # %bb.0: +; 
RV64DINXZHINX-NEXT: lui a1, %hi(gh) +; RV64DINXZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV64DINXZHINX-NEXT: #APP +; RV64DINXZHINX-NEXT: fadd.h a0, a0, a1 +; RV64DINXZHINX-NEXT: #NO_APP +; RV64DINXZHINX-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm "fadd.h $0, $1, $2", "=r,r,r"(half %a, half %1) + ret half %2 +} + +define half @constraint_cr_half(half %a) nounwind { +; RV32ZHINX-LABEL: constraint_cr_half: +; RV32ZHINX: # %bb.0: +; RV32ZHINX-NEXT: lui a1, %hi(gh) +; RV32ZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV32ZHINX-NEXT: #APP +; RV32ZHINX-NEXT: fadd.h a0, a0, a1 +; RV32ZHINX-NEXT: #NO_APP +; RV32ZHINX-NEXT: ret +; +; RV64ZHINX-LABEL: constraint_cr_half: +; RV64ZHINX: # %bb.0: +; RV64ZHINX-NEXT: lui a1, %hi(gh) +; RV64ZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV64ZHINX-NEXT: #APP +; RV64ZHINX-NEXT: fadd.h a0, a0, a1 +; RV64ZHINX-NEXT: #NO_APP +; RV64ZHINX-NEXT: ret +; +; RV32DINXZHINX-LABEL: constraint_cr_half: +; RV32DINXZHINX: # %bb.0: +; RV32DINXZHINX-NEXT: lui a1, %hi(gh) +; RV32DINXZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV32DINXZHINX-NEXT: #APP +; RV32DINXZHINX-NEXT: fadd.h a0, a0, a1 +; RV32DINXZHINX-NEXT: #NO_APP +; RV32DINXZHINX-NEXT: ret +; +; RV64DINXZHINX-LABEL: constraint_cr_half: +; RV64DINXZHINX: # %bb.0: +; RV64DINXZHINX-NEXT: lui a1, %hi(gh) +; RV64DINXZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV64DINXZHINX-NEXT: #APP +; RV64DINXZHINX-NEXT: fadd.h a0, a0, a1 +; RV64DINXZHINX-NEXT: #NO_APP +; RV64DINXZHINX-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm "fadd.h $0, $1, $2", "=^cr,^cr,^cr"(half %a, half %1) + ret half %2 +} + +define half @constraint_half_abi_name(half %a) nounwind { +; RV32ZHINX-LABEL: constraint_half_abi_name: +; RV32ZHINX: # %bb.0: +; RV32ZHINX-NEXT: addi sp, sp, -16 +; RV32ZHINX-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZHINX-NEXT: lui a1, %hi(gh) +; RV32ZHINX-NEXT: lh s0, %lo(gh)(a1) +; RV32ZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 +; RV32ZHINX-NEXT: #APP +; RV32ZHINX-NEXT: fadd.s t0, a0, s0 +; RV32ZHINX-NEXT: #NO_APP +; RV32ZHINX-NEXT: mv a0, t0 +; RV32ZHINX-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZHINX-NEXT: addi sp, sp, 16 +; RV32ZHINX-NEXT: ret +; +; RV64ZHINX-LABEL: constraint_half_abi_name: +; RV64ZHINX: # %bb.0: +; RV64ZHINX-NEXT: addi sp, sp, -16 +; RV64ZHINX-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64ZHINX-NEXT: lui a1, %hi(gh) +; RV64ZHINX-NEXT: lh s0, %lo(gh)(a1) +; RV64ZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 +; RV64ZHINX-NEXT: #APP +; RV64ZHINX-NEXT: fadd.s t0, a0, s0 +; RV64ZHINX-NEXT: #NO_APP +; RV64ZHINX-NEXT: mv a0, t0 +; RV64ZHINX-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64ZHINX-NEXT: addi sp, sp, 16 +; RV64ZHINX-NEXT: ret +; +; RV32DINXZHINX-LABEL: constraint_half_abi_name: +; RV32DINXZHINX: # %bb.0: +; RV32DINXZHINX-NEXT: addi sp, sp, -16 +; RV32DINXZHINX-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32DINXZHINX-NEXT: lui a1, %hi(gh) +; RV32DINXZHINX-NEXT: lh s0, %lo(gh)(a1) +; RV32DINXZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 +; RV32DINXZHINX-NEXT: #APP +; RV32DINXZHINX-NEXT: fadd.s t0, a0, s0 +; RV32DINXZHINX-NEXT: #NO_APP +; RV32DINXZHINX-NEXT: mv a0, t0 +; RV32DINXZHINX-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32DINXZHINX-NEXT: addi sp, sp, 16 +; RV32DINXZHINX-NEXT: ret +; +; RV64DINXZHINX-LABEL: constraint_half_abi_name: +; RV64DINXZHINX: # %bb.0: +; RV64DINXZHINX-NEXT: addi sp, sp, -16 +; RV64DINXZHINX-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64DINXZHINX-NEXT: lui a1, %hi(gh) +; RV64DINXZHINX-NEXT: lh s0, %lo(gh)(a1) +; RV64DINXZHINX-NEXT: # kill: def 
$x10_h killed $x10_h def $x10 +; RV64DINXZHINX-NEXT: #APP +; RV64DINXZHINX-NEXT: fadd.s t0, a0, s0 +; RV64DINXZHINX-NEXT: #NO_APP +; RV64DINXZHINX-NEXT: mv a0, t0 +; RV64DINXZHINX-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64DINXZHINX-NEXT: addi sp, sp, 16 +; RV64DINXZHINX-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm "fadd.s $0, $1, $2", "={t0},{a0},{s0}"(half %a, half %1) + ret half %2 +} diff --git a/llvm/test/CodeGen/RISCV/inline-asm.ll b/llvm/test/CodeGen/RISCV/inline-asm.ll index cb240d2dc68d..79266743a1d0 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm.ll @@ -56,6 +56,29 @@ define i32 @constraint_r_zero(i32 %a) nounwind { ret i32 %2 } +define i32 @constraint_cr(i32 %a) nounwind { +; RV32I-LABEL: constraint_cr: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, %hi(gi) +; RV32I-NEXT: lw a1, %lo(gi)(a1) +; RV32I-NEXT: #APP +; RV32I-NEXT: c.add a0, a0, a1 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_cr: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, %hi(gi) +; RV64I-NEXT: lw a1, %lo(gi)(a1) +; RV64I-NEXT: #APP +; RV64I-NEXT: c.add a0, a0, a1 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret + %1 = load i32, ptr @gi + %2 = tail call i32 asm "c.add $0, $1, $2", "=^cr,0,^cr"(i32 %a, i32 %1) + ret i32 %2 +} + define i32 @constraint_i(i32 %a) nounwind { ; RV32I-LABEL: constraint_i: ; RV32I: # %bb.0: @@ -215,6 +238,49 @@ define i32 @modifier_i_reg(i32 %a, i32 %b) nounwind { ret i32 %1 } +;; `.insn 0x4, 0x33 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)` is the +;; raw encoding of `add` + +define i32 @modifier_N_reg(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: modifier_N_reg: +; RV32I: # %bb.0: +; RV32I-NEXT: #APP +; RV32I-NEXT: .insn 0x4, 0x33 | (10 << 7) | (10 << 15) | (11 << 20) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: modifier_N_reg: +; RV64I: # %bb.0: +; RV64I-NEXT: #APP +; RV64I-NEXT: .insn 0x4, 0x33 | (10 << 7) | (10 << 15) | (11 << 20) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret + %1 = tail call i32 asm ".insn 0x4, 0x33 | (${0:N} << 7) | (${1:N} << 15) | (${2:N} << 20)", "=r,r,r"(i32 %a, i32 %b) + ret i32 %1 +} + +;; `.insn 0x2, 0x9422 | (${0:N} << 7) | (${2:N} << 2)` is the raw encoding of +;; `c.add` (note the constraint that the first input should be the same as the +;; output). + +define i32 @modifier_N_with_cr_reg(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: modifier_N_with_cr_reg: +; RV32I: # %bb.0: +; RV32I-NEXT: #APP +; RV32I-NEXT: .insn 0x2, 0x9422 | (10 << 7) | (11 << 2) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: modifier_N_with_cr_reg: +; RV64I: # %bb.0: +; RV64I-NEXT: #APP +; RV64I-NEXT: .insn 0x2, 0x9422 | (10 << 7) | (11 << 2) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret + %1 = tail call i32 asm ".insn 0x2, 0x9422 | (${0:N} << 7) | (${2:N} << 2)", "=^cr,0,^cr"(i32 %a, i32 %b) + ret i32 %1 +} + define void @operand_global() nounwind { ; RV32I-LABEL: operand_global: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll index 95695aa69776..18bd41a210f5 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll @@ -1,6 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s \ ; RUN: -target-abi=ilp32 -mattr=+zhinx | FileCheck %s + +;; These tests cover the use of `r` and `cr` constraints for floating point values on rv32. 
+;; +;; In particular, there is significant complexity around using paired GPRs for double values on rv32. + define dso_local void @zdinx_asm(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind { ; CHECK-LABEL: zdinx_asm: ; CHECK: # %bb.0: # %entry @@ -50,3 +55,59 @@ entry: store half %0, ptr %arrayidx, align 8 ret void } + +define dso_local void @zdinx_asm_cr(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind { +; CHECK-LABEL: zdinx_asm_cr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: mv s1, a2 +; CHECK-NEXT: mv a4, a3 +; CHECK-NEXT: mv s0, a1 +; CHECK-NEXT: #APP +; CHECK-NEXT: fsgnjx.d a2, s0, a4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: sw a2, 8(a0) +; CHECK-NEXT: sw a3, 12(a0) +; CHECK-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +entry: + %arrayidx = getelementptr inbounds double, ptr %a, i32 1 + %0 = tail call double asm "fsgnjx.d $0, $1, $2", "=^cr,^cr,^cr"(double %b, double %c) + store double %0, ptr %arrayidx, align 8 + ret void +} + +define dso_local void @zfinx_asm_cr(ptr nocapture noundef writeonly %a, float noundef %b, float noundef %c) nounwind { +; CHECK-LABEL: zfinx_asm_cr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: fsgnjx.s a1, a1, a2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: sw a1, 4(a0) +; CHECK-NEXT: ret +entry: + %arrayidx = getelementptr inbounds float, ptr %a, i32 1 + %0 = tail call float asm "fsgnjx.s $0, $1, $2", "=^cr,^cr,^cr"(float %b, float %c) + store float %0, ptr %arrayidx, align 8 + ret void +} + +define dso_local void @zhinx_asm_cr(ptr nocapture noundef writeonly %a, half noundef %b, half noundef %c) nounwind { +; CHECK-LABEL: zhinx_asm_cr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: fsgnjx.h a1, a1, a2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: sh a1, 2(a0) +; CHECK-NEXT: ret +entry: + %arrayidx = getelementptr inbounds half, ptr %a, i32 1 + %0 = tail call half asm "fsgnjx.h $0, $1, $2", "=^cr,^cr,^cr"(half %b, half %c) + store half %0, ptr %arrayidx, align 8 + ret void +} -- GitLab From 19aa4c854a34a53ef9fa04bcbbc93761239a7234 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Fri, 18 Oct 2024 05:42:57 -0400 Subject: [PATCH 029/511] [NFC] Fix implicit-fallthrough warnings (#112825) This is one of the many PRs to fix errors with LLVM_ENABLE_WERROR=on. Built by GCC 11. Fix warnings: clang/lib/Sema/SemaFunctionEffects.cpp:1531:5: error: this statement may fall through [-Werror=implicit-fallthrough=] 1531 | switch (DiffKind) { | ^~~~~~ --- clang/lib/Sema/SemaFunctionEffects.cpp | 4 +++- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 70e5d78661a8..3fa326db06ee 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -1540,6 +1540,7 @@ bool Sema::FunctionEffectDiff::shouldDiagnoseConversion( // matching is better. return true; } + break; case FunctionEffect::Kind::Blocking: case FunctionEffect::Kind::Allocating: return false; @@ -1563,6 +1564,7 @@ bool Sema::FunctionEffectDiff::shouldDiagnoseRedeclaration( // All these forms of mismatches are diagnosed. 
return true; } + break; case FunctionEffect::Kind::Blocking: case FunctionEffect::Kind::Allocating: return false; @@ -1592,7 +1594,7 @@ Sema::FunctionEffectDiff::shouldDiagnoseMethodOverride( case Kind::ConditionMismatch: return OverrideResult::Warn; } - + break; case FunctionEffect::Kind::Blocking: case FunctionEffect::Kind::Allocating: return OverrideResult::NoAction; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 7f942de74bdc..93c2d92ef7c1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1028,6 +1028,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); } + break; } case NVPTX::Ordering::SequentiallyConsistent: { switch (S) { @@ -1046,6 +1047,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.", ScopeToString(S))); } + break; } case NVPTX::Ordering::NotAtomic: case NVPTX::Ordering::Relaxed: -- GitLab From 4e0169005ea53af90ee43562c0d41c113c8498cf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Oct 2024 10:39:59 +0100 Subject: [PATCH 030/511] [X86] Add FMA constant folding test coverage Shows we constant fold scalars but not vectors --- llvm/test/CodeGen/X86/fma.ll | 184 +++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll index c55f50e97786..4f5e9af97bc8 100644 --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -2096,6 +2096,190 @@ entry: ret <8 x double> %call } +define float @constant_fold_f32() { +; FMA32-LABEL: constant_fold_f32: +; FMA32: ## %bb.0: +; FMA32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ## encoding: [0xd9,0x05,A,A,A,A] +; FMA32-NEXT: ## fixup A - offset: 2, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; FMA32-NEXT: retl ## encoding: [0xc3] +; +; FMACALL32-LABEL: constant_fold_f32: +; FMACALL32: ## %bb.0: +; FMACALL32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ## encoding: [0xd9,0x05,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 2, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; FMACALL32-NEXT: retl ## encoding: [0xc3] +; +; FMA64-LABEL: constant_fold_f32: +; FMA64: ## %bb.0: +; FMA64-NEXT: vmovss {{.*#+}} xmm0 = [1.02E+3,0.0E+0,0.0E+0,0.0E+0] +; FMA64-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] +; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; FMA64-NEXT: retq ## encoding: [0xc3] +; +; FMACALL64-LABEL: constant_fold_f32: +; FMACALL64: ## %bb.0: +; FMACALL64-NEXT: movss {{.*#+}} xmm0 = [1.02E+3,0.0E+0,0.0E+0,0.0E+0] +; FMACALL64-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; FMACALL64-NEXT: retq ## encoding: [0xc3] +; +; AVX512-LABEL: constant_fold_f32: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [1.02E+3,0.0E+0,0.0E+0,0.0E+0] +; AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] +; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; AVX512-NEXT: retq ## encoding: [0xc3] +; +; AVX512VL-LABEL: constant_fold_f32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = [1.02E+3,0.0E+0,0.0E+0,0.0E+0] +; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: 
[0xc5,0xfa,0x10,0x05,A,A,A,A] +; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; AVX512VL-NEXT: retq ## encoding: [0xc3] + %r = call float @llvm.fma.f32(float 5.000000e+01, float 2.000000e+01, float 2.000000e+01) + ret float %r +} + +define <4 x float> @constant_fold_v4f32() { +; FMA32-LABEL: constant_fold_v4f32: +; FMA32: ## %bb.0: +; FMA32-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,1.0E+1,2.0E+1,3.0E+1] +; FMA32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x0d,A,A,A,A] +; FMA32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; FMA32-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+1,5.0E+1,6.0E+1,7.0E+1] +; FMA32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; FMA32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; FMA32-NEXT: vfmadd213ps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x05,A,A,A,A] +; FMA32-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; FMA32-NEXT: ## xmm0 = (xmm1 * xmm0) + mem +; FMA32-NEXT: retl ## encoding: [0xc3] +; +; FMACALL32-LABEL: constant_fold_v4f32: +; FMACALL32: ## %bb.0: +; FMACALL32-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,4.9E+2,1.18E+3,2.07E+3] +; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; FMACALL32-NEXT: retl ## encoding: [0xc3] +; +; FMA64-LABEL: constant_fold_v4f32: +; FMA64: ## %bb.0: +; FMA64-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,1.0E+1,2.0E+1,3.0E+1] +; FMA64-NEXT: ## encoding: [0xc5,0xf8,0x28,0x0d,A,A,A,A] +; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; FMA64-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+1,5.0E+1,6.0E+1,7.0E+1] +; FMA64-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; FMA64-NEXT: vfmadd213ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x05,A,A,A,A] +; FMA64-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; FMA64-NEXT: ## xmm0 = (xmm1 * xmm0) + mem +; FMA64-NEXT: retq ## encoding: [0xc3] +; +; FMACALL64-LABEL: constant_fold_v4f32: +; FMACALL64: ## %bb.0: +; FMACALL64-NEXT: movaps {{.*#+}} xmm0 = [0.0E+0,4.9E+2,1.18E+3,2.07E+3] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; FMACALL64-NEXT: retq ## encoding: [0xc3] +; +; AVX512-LABEL: constant_fold_v4f32: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,1.0E+1,2.0E+1,3.0E+1] +; AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x0d,A,A,A,A] +; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+1,5.0E+1,6.0E+1,7.0E+1] +; AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; AVX512-NEXT: vfmadd213ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x05,A,A,A,A] +; AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; AVX512-NEXT: ## xmm0 = (xmm1 * xmm0) + mem +; AVX512-NEXT: retq ## encoding: [0xc3] +; +; AVX512VL-LABEL: constant_fold_v4f32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = 
[0.0E+0,1.0E+1,2.0E+1,3.0E+1]
+; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0d,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+1,5.0E+1,6.0E+1,7.0E+1]
+; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512VL-NEXT: vfmadd213ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x05,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512VL-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
+; AVX512VL-NEXT: retq ## encoding: [0xc3]
+  %r = call <4 x float> @llvm.fma.v4f32(<4 x float> <float 0.000000e+00, float 1.000000e+01, float 2.000000e+01, float 3.000000e+01>, <4 x float> <float 4.000000e+01, float 5.000000e+01, float 6.000000e+01, float 7.000000e+01>, <4 x float> <float 0.000000e+00, float -1.000000e+01, float -2.000000e+01, float -3.000000e+01>)
+  ret <4 x float> %r
+}
+
+define <2 x double> @constant_fold_v2f64() {
+; FMA32-LABEL: constant_fold_v2f64:
+; FMA32: ## %bb.0:
+; FMA32-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+1,2.0E+1]
+; FMA32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x0d,A,A,A,A]
+; FMA32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; FMA32-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0]
+; FMA32-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
+; FMA32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; FMA32-NEXT: vfmadd231pd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xb8,0x05,A,A,A,A]
+; FMA32-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; FMA32-NEXT: ## xmm0 = (xmm1 * mem) + xmm0
+; FMA32-NEXT: retl ## encoding: [0xc3]
+;
+; FMACALL32-LABEL: constant_fold_v2f64:
+; FMACALL32: ## %bb.0:
+; FMACALL32-NEXT: vmovaps {{.*#+}} xmm0 = [4.1E+2,1.4E+3]
+; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; FMACALL32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; FMACALL32-NEXT: retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: constant_fold_v2f64:
+; FMA64: ## %bb.0:
+; FMA64-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+1,2.0E+1]
+; FMA64-NEXT: ## encoding: [0xc5,0xf9,0x28,0x0d,A,A,A,A]
+; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; FMA64-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0]
+; FMA64-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
+; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; FMA64-NEXT: vfmadd231pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xb8,0x05,A,A,A,A]
+; FMA64-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; FMA64-NEXT: ## xmm0 = (xmm1 * mem) + xmm0
+; FMA64-NEXT: retq ## encoding: [0xc3]
+;
+; FMACALL64-LABEL: constant_fold_v2f64:
+; FMACALL64: ## %bb.0:
+; FMACALL64-NEXT: movaps {{.*#+}} xmm0 = [4.1E+2,1.4E+3]
+; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; FMACALL64-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; FMACALL64-NEXT: retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: constant_fold_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+1,2.0E+1]
+; AVX512-NEXT: ## encoding: [0xc5,0xf9,0x28,0x0d,A,A,A,A]
+; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0]
+; AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
+; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512-NEXT: vfmadd231pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xb8,0x05,A,A,A,A]
+; AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512-NEXT: ## xmm0 = (xmm1 * mem) + xmm0
+; AVX512-NEXT: retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: constant_fold_v2f64:
+; AVX512VL: ## %bb.0:
+; AVX512VL-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+1,2.0E+1]
+; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x0d,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0]
+; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512VL-NEXT: vfmadd231pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xb8,0x05,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512VL-NEXT: ## xmm0 = (xmm1 * mem) + xmm0
+; AVX512VL-NEXT: retq ## encoding: [0xc3]
+  %r = call <2 x double> @llvm.fma.v2f64(<2 x double> <double 1.000000e+01, double 2.000000e+01>, <2 x double> <double 4.000000e+01, double 7.000000e+01>, <2 x double> <double 1.000000e+01, double 0.000000e+00>)
+  ret <2 x double> %r
+}
+
 declare float @llvm.fma.f32(float, float, float)
 declare double @llvm.fma.f64(double, double, double)
 declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80)

--
GitLab

From 8f6d4913bbc4ad9ba9c139b8ce6dd69058435d17 Mon Sep 17 00:00:00 2001
From: JL2210
Date: Fri, 18 Oct 2024 05:50:44 -0400
Subject: [PATCH 031/511] [llvm][TableGen] Count implicit defs as well as
 explicit ones in the GlobalISel TableGen emitter (#112673)

`NumDefs` only counts the number of registers in `(outs)`, not any
implicit defs specified with `Defs = [...]`.

This causes patterns with physical register defs to fail to import here
instead of later where implicit defs are rendered.

Add on `ImplicitDefs.size()` to count both and create `DstExpDefs` to
count only explicit defs, used later on.
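For illustration, a minimal standalone C++ sketch of the counting change,
using a hypothetical `InstDesc` struct rather than the emitter's real data
structures: the "src defines more than dst" check compares against explicit
plus implicit defs, while only the explicit defs are walked for the
per-result register-bank constraints.

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

// Hypothetical stand-in for the emitter's instruction description.
struct InstDesc {
  std::size_t NumExplicitDefs;           // registers listed in (outs)
  std::vector<std::string> ImplicitDefs; // registers from `Defs = [...]`
};

// Total def count used for the import-eligibility check.
std::size_t countDstDefs(const InstDesc &D) {
  return D.NumExplicitDefs + D.ImplicitDefs.size();
}

// Only explicit defs participate in per-result operand constraints.
std::size_t numConstrainedDefs(const InstDesc &D, std::size_t SrcNumDefs) {
  return std::min(D.NumExplicitDefs, SrcNumDefs);
}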
---
 .../test/TableGen/GlobalISelEmitter-implicit-defs.td | 12 ++++++++++++
 llvm/utils/TableGen/GlobalISelEmitter.cpp            |  7 +++++--
 2 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td

diff --git a/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td b/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td
new file mode 100644
index 000000000000..79af1a336f28
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td
@@ -0,0 +1,12 @@
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o /dev/null 2>&1 < %s | FileCheck %s --implicit-check-not="Skipped pattern"
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+// CHECK: Skipped pattern: Pattern defines a physical register
+let Uses = [B0], Defs = [B0] in
+def tst1 : I<(outs), (ins), [(set B0, (add B0, 1))]>;
+
+// CHECK: Skipped pattern: Src pattern result has 1 def(s) without the HasNoUse predicate set to true but Dst MI has no def
+let Uses = [B0] in
+def tst2 : I<(outs), (ins), [(set B0, (add B0, 1))]>;
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index c53f705a38db..29c64ba95ff8 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -2023,7 +2023,10 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
   auto &DstI = Target.getInstruction(DstOp);
   StringRef DstIName = DstI.TheDef->getName();

-  unsigned DstNumDefs = DstI.Operands.NumDefs,
+  // Count both implicit and explicit defs in the dst instruction.
+  // This avoids errors importing patterns that have inherent implicit defs.
+  unsigned DstExpDefs = DstI.Operands.NumDefs,
+           DstNumDefs = DstI.ImplicitDefs.size() + DstExpDefs,
            SrcNumDefs = Src.getExtTypes().size();
   if (DstNumDefs < SrcNumDefs) {
     if (DstNumDefs != 0)
@@ -2045,7 +2048,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
   // The root of the match also has constraints on the register bank so that it
   // matches the result instruction.
   unsigned OpIdx = 0;
-  unsigned N = std::min(DstNumDefs, SrcNumDefs);
+  unsigned N = std::min(DstExpDefs, SrcNumDefs);

   for (unsigned I = 0; I < N; ++I) {
     const TypeSetByHwMode &VTy = Src.getExtType(I);
--
GitLab

From a630771b28f4b252e2754776b8f3ab416133951a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 18 Oct 2024 10:52:55 +0100
Subject: [PATCH 032/511] [DAG] isConstantIntBuildVectorOrConstantInt - peek
 through bitcasts (#112710)

Alter both isConstantIntBuildVectorOrConstantInt +
isConstantFPBuildVectorOrConstantFP to return a bool instead of the
underlying SDNode, and adjust usage to account for this.

Update isConstantIntBuildVectorOrConstantInt to peek through bitcasts
when attempting to find a constant, in particular this improves
canonicalization of constants to the RHS on commutable instructions.
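As a rough illustration only (a simplified node model, not SelectionDAG's
actual API), the peek-through amounts to stripping bitcast wrappers before
classifying the node:

// Hypothetical miniature node model for illustration.
struct Node {
  enum Kind { ConstantInt, BuildVectorOfConstants, Bitcast, Other } K;
  const Node *Src = nullptr; // operand of a Bitcast node
};

const Node *peekThroughBitcasts(const Node *N) {
  while (N->K == Node::Bitcast)
    N = N->Src;
  return N;
}

// A bitcast of a constant build_vector now still classifies as constant,
// letting commutable binops canonicalize it to the RHS.
bool isConstantIntBuildVectorOrConstantInt(const Node *N) {
  N = peekThroughBitcasts(N);
  return N->K == Node::ConstantInt || N->K == Node::BuildVectorOfConstants;
}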
X86 is the beneficiary here as it often bitcasts rematerializable 0/-1 vector constants as vXi32 and bitcasts to the requested type Minor cleanup that helps with #107423 --- llvm/include/llvm/CodeGen/SelectionDAG.h | 5 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 +-- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 47 +-- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +- llvm/test/CodeGen/X86/avx2-arith.ll | 2 +- llvm/test/CodeGen/X86/combine-sra.ll | 9 +- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 50 ++-- llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 60 ++-- .../CodeGen/X86/min-legal-vector-width.ll | 18 +- llvm/test/CodeGen/X86/pmul.ll | 62 ++-- .../CodeGen/X86/prefer-avx256-wide-mul.ll | 2 +- llvm/test/CodeGen/X86/psubus.ll | 81 +++-- llvm/test/CodeGen/X86/sat-add.ll | 4 +- .../X86/vector-shuffle-combining-sse41.ll | 6 +- llvm/test/CodeGen/X86/vector-trunc-packus.ll | 275 ++++++++--------- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 275 ++++++++--------- llvm/test/CodeGen/X86/vector-trunc-usat.ll | 279 ++++++++---------- 18 files changed, 570 insertions(+), 650 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index b8f80738486a..12ff36c89e33 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -2301,10 +2301,11 @@ public: Align getEVTAlign(EVT MemoryVT) const; /// Test whether the given value is a constant int or similar node. - SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N) const; + bool isConstantIntBuildVectorOrConstantInt(SDValue N, + bool AllowOpaques = true) const; /// Test whether the given value is a constant FP or similar node. - SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) const ; + bool isConstantFPBuildVectorOrConstantFP(SDValue N) const; /// \returns true if \p N is any kind of constant or build_vector of /// constants, int or float. If a vector, it may not necessarily be a splat. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9946cf94a7c7..3ff6ad28dc6a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1205,13 +1205,13 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); - if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) { + if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) { SDNodeFlags NewFlags; if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap()) NewFlags.setNoUnsignedWrap(true); - if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) { + if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1})) return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags); @@ -9931,10 +9931,10 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { // fold (rot* (rot* x, c2), c1) // -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize) + bitsize) % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { - SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); - SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); - if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { - EVT ShiftVT = C1->getValueType(0); + bool C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); + bool C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); + if (C1 && C2 && N1.getValueType() == N0.getOperand(1).getValueType()) { + EVT ShiftVT = N1.getValueType(); bool SameSide = (N->getOpcode() == NextOp); unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); @@ -16805,8 +16805,8 @@ SDValue DAGCombiner::visitVP_FADD(SDNode *N) { SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - SDNode *N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); - SDNode *N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); + bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); + bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; @@ -16903,10 +16903,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // of rounding steps. 
  if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
     if (N0.getOpcode() == ISD::FMUL) {
-      SDNode *CFP00 =
-          DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
-      SDNode *CFP01 =
-          DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
+      bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+      bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));

       // (fadd (fmul x, c), x) -> (fmul x, c+1)
       if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
@@ -16926,10 +16924,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
     }

     if (N1.getOpcode() == ISD::FMUL) {
-      SDNode *CFP10 =
-          DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
-      SDNode *CFP11 =
-          DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
+      bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+      bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));

       // (fadd x, (fmul x, c)) -> (fmul x, c+1)
       if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
@@ -16949,8 +16945,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
     }

     if (N0.getOpcode() == ISD::FADD) {
-      SDNode *CFP00 =
-          DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+      bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
       // (fadd (fadd x, x), x) -> (fmul x, 3.0)
       if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
           (N0.getOperand(0) == N1)) {
@@ -16960,8 +16955,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
     }

     if (N1.getOpcode() == ISD::FADD) {
-      SDNode *CFP10 =
-          DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+      bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
       // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
       if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
           N1.getOperand(0) == N0) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 66c078b1d35b..43d49674297f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6962,10 +6962,10 @@ void SelectionDAG::canonicalizeCommutativeBinop(unsigned Opcode, SDValue &N1,

   // Canonicalize:
   //   binop(const, nonconst) -> binop(nonconst, const)
-  SDNode *N1C = isConstantIntBuildVectorOrConstantInt(N1);
-  SDNode *N2C = isConstantIntBuildVectorOrConstantInt(N2);
-  SDNode *N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
-  SDNode *N2CFP = isConstantFPBuildVectorOrConstantFP(N2);
+  bool N1C = isConstantIntBuildVectorOrConstantInt(N1);
+  bool N2C = isConstantIntBuildVectorOrConstantInt(N2);
+  bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
+  bool N2CFP = isConstantFPBuildVectorOrConstantFP(N2);
   if ((N1C && !N2C) || (N1CFP && !N2CFP))
     std::swap(N1, N2);

@@ -13197,39 +13197,44 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
   return true;
 }

-// Returns the SDNode if it is a constant integer BuildVector
-// or constant integer.
-SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) const {
-  if (isa<ConstantSDNode>(N))
-    return N.getNode();
+// Returns true if it is a constant integer BuildVector or constant integer,
+// possibly hidden by a bitcast.
+bool SelectionDAG::isConstantIntBuildVectorOrConstantInt(
+    SDValue N, bool AllowOpaques) const {
+  N = peekThroughBitcasts(N);
+
+  if (auto *C = dyn_cast<ConstantSDNode>(N))
+    return AllowOpaques || !C->isOpaque();
+
   if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
-    return N.getNode();
+    return true;
+
   // Treat a GlobalAddress supporting constant offset folding as a
   // constant integer.
-  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N))
+  if (auto *GA = dyn_cast<GlobalAddressSDNode>(N))
     if (GA->getOpcode() == ISD::GlobalAddress && TLI->isOffsetFoldingLegal(GA))
-      return GA;
+      return true;
+
   if ((N.getOpcode() == ISD::SPLAT_VECTOR) &&
       isa<ConstantSDNode>(N.getOperand(0)))
-    return N.getNode();
-  return nullptr;
+    return true;
+
+  return false;
 }

-// Returns the SDNode if it is a constant float BuildVector
-// or constant float.
-SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
+// Returns true if it is a constant float BuildVector or constant float.
+bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
   if (isa<ConstantFPSDNode>(N))
-    return N.getNode();
+    return true;

   if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
-    return N.getNode();
+    return true;

   if ((N.getOpcode() == ISD::SPLAT_VECTOR) &&
       isa<ConstantFPSDNode>(N.getOperand(0)))
-    return N.getNode();
+    return true;

-  return nullptr;
+  return false;
 }

 std::optional<bool> SelectionDAG::isBoolConstant(SDValue N,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d5466e0a1cbd..7448416c682a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20760,7 +20760,7 @@ static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
   if (!Add.hasOneUse())
     return SDValue();

-  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X)))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(X))
     return SDValue();

   SDValue M1 = Add.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 102789a3e952..ff66eb15508c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56543,14 +56543,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   SDValue Op1 = N->getOperand(1);
   SDLoc DL(N);

-  // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
   auto IsNonOpaqueConstant = [&](SDValue Op) {
-    if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
-      if (auto *Cst = dyn_cast<ConstantSDNode>(C))
-        return !Cst->isOpaque();
-      return true;
-    }
-    return false;
+    return DAG.isConstantIntBuildVectorOrConstantInt(Op,
+                                                     /*AllowOpaques*/ false);
   };

   // X86 can't encode an immediate LHS of a sub.
See if we can push the diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index 90733dfb8465..44ab33ad67f2 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -122,7 +122,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; CHECK-LABEL: mul_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm3 +; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3 ; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3 ; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index 7eee418742dd..c982884314f6 100644 --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -725,12 +725,11 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519] -; SSE41-NEXT: movdqa %xmm8, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index 6fd3db3464de..ee83a79b6dd5 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2369,8 +2369,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -2391,7 +2391,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -2432,7 +2432,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2450,7 +2450,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2592,8 +2592,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm3, %xmm5 ; SSE41-NEXT: pand %xmm2, %xmm5 @@ -2616,7 +2616,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -2659,7 +2659,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2677,7 +2677,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2823,8 +2823,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: 
pand %xmm3, %xmm5 @@ -2846,7 +2846,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -2889,7 +2889,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -2908,7 +2908,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -3054,8 +3054,8 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -3077,7 +3077,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -3120,7 +3120,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3139,7 +3139,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: 
vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3287,8 +3287,8 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -3311,7 +3311,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -3356,7 +3356,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3376,7 +3376,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 5a1c4c8a52c8..b4e8f0a23016 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1914,7 +1914,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -1922,7 +1922,7 @@ define <32 x i8> 
@vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -1944,7 +1944,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1974,14 +1974,14 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -1999,7 +1999,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2088,7 +2088,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6 @@ -2096,7 +2096,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2120,7 +2120,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: 
vpand %ymm2, %ymm3, %ymm4
 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4
 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
@@ -2150,14 +2150,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
 ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5
+; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
 ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
-; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4
+; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -2176,7 +2176,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
 ; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
@@ -2266,7 +2266,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
@@ -2274,7 +2274,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
 ; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0
 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
@@ -2297,7 +2297,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4
 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
@@ -2328,14 +2328,14 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
 ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
-; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5
+; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
 ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0
 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6
-; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4
+; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
@@ -2354,7 +2354,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4
 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
@@ -2444,7 +2444,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8
 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
@@ -2452,7 +2452,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
 ; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
@@ -2475,7 +2475,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
@@ -2506,14 +2506,14 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
 ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5
+; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
 ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
-; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4
+; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -2532,7 +2532,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
@@ -2623,7 +2623,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
@@ -2631,7 +2631,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
@@ -2655,7 +2655,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
@@ -2687,14 +2687,14 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
 ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
-; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5
+; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
 ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0
 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6
-; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4
+; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
@@ -2714,7 +2714,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 8289e885618f..9b08d8baacee 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -892,13 +892,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3
+; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
 ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
 ; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2
+; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
 ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
@@ -913,13 +913,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpand %ymm3, %ymm4, %ymm5
+; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5
 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
 ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1
 ; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
-; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3
+; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3
 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
 ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2
 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
@@ -939,13 +939,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3
+; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
 ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
 ; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; CHECK-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2
+; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
 ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
 ; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
@@ -967,7 +967,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-SKX-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
 ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
@@ -980,7 +980,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpandq %zmm1, %zmm2, %zmm3
+; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3
 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
 ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
@@ -997,7 +997,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
 ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
 ; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 6c3d04863118..fe8a4fa16312 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -161,8 +161,8 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
 ; SSE41-LABEL: mul_v16i8:
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pand %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
 ; SSE41-NEXT: movdqa %xmm0, %xmm4
 ; SSE41-NEXT: pmaddubsw %xmm3, %xmm4
 ; SSE41-NEXT: pand %xmm2, %xmm4
@@ -586,17 +586,16 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm2, %xmm5
+; SSE41-NEXT: pandn %xmm2, %xmm5
+; SSE41-NEXT: pand %xmm4, %xmm2
 ; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm5, %xmm6
+; SSE41-NEXT: pmaddubsw %xmm2, %xmm6
 ; SSE41-NEXT: pand %xmm4, %xmm6
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm5
 ; SSE41-NEXT: pmaddubsw %xmm5, %xmm0
 ; SSE41-NEXT: psllw $8, %xmm0
 ; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
 ; SSE41-NEXT: movdqa %xmm1, %xmm5
 ; SSE41-NEXT: pmaddubsw %xmm2, %xmm5
 ; SSE41-NEXT: pand %xmm4, %xmm5
@@ -609,7 +608,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
 ; AVX2-LABEL: mul_v32i8:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3
 ; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
@@ -621,7 +620,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
 ; AVX512F-LABEL: mul_v32i8:
 ; AVX512F: # %bb.0: # %entry
 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm3
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3
 ; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
@@ -902,37 +901,34 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
 ; SSE41: # %bb.0: # %entry
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pand %xmm4, %xmm9
+; SSE41-NEXT: pandn %xmm4, %xmm9
+; SSE41-NEXT: pand %xmm8, %xmm4
 ; SSE41-NEXT: movdqa %xmm0, %xmm10
-; SSE41-NEXT: pmaddubsw %xmm9, %xmm10
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm10
 ; SSE41-NEXT: pand %xmm8, %xmm10
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pandn %xmm4, %xmm9
 ; SSE41-NEXT: pmaddubsw %xmm9, %xmm0
 ; SSE41-NEXT: psllw $8, %xmm0
 ; SSE41-NEXT: por %xmm10, %xmm0
 ; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: pandn %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm9
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm9
+; SSE41-NEXT: pmaddubsw %xmm5, %xmm9
 ; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pandn %xmm5, %xmm4
 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
 ; SSE41-NEXT: psllw $8, %xmm1
 ; SSE41-NEXT: por %xmm9, %xmm1
 ; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pand %xmm6, %xmm4
+; SSE41-NEXT: pandn %xmm6, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm6
 ; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
+; SSE41-NEXT: pmaddubsw %xmm6, %xmm5
 ; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pandn %xmm6, %xmm4
 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm2
 ; SSE41-NEXT: psllw $8, %xmm2
 ; SSE41-NEXT: por %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pand %xmm7, %xmm4
+; SSE41-NEXT: movdqa %xmm7, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm4
 ; SSE41-NEXT: movdqa %xmm3, %xmm5
 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
 ; SSE41-NEXT: pand %xmm8, %xmm5
@@ -945,14 +941,14 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
 ; AVX2-LABEL: mul_v64i8:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm5
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5
 ; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
 ; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5
 ; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm2
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
@@ -963,28 +959,28 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
 ;
 ; AVX512F-LABEL: mul_v64i8:
 ; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm6
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6
 ; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm1
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1
 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1
 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm3)
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2)
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: mul_v64i8:
 ; AVX512BW: # %bb.0: # %entry
 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
 ; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index c9bb3de92dcd..885b07585e68 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -59,7 +59,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX256BW-LABEL: test_mul_32i8:
 ; AVX256BW: # %bb.0:
 ; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX256BW-NEXT: vpand %ymm1, %ymm2, %ymm3
+; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
 ; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1
 ; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index be8adf697d5c..9656822d144e 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1671,12 +1671,11 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE41: # %bb.0: # %vector.ph
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pxor %xmm9, %xmm8
 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm8
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
@@ -1684,22 +1683,20 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535]
 ; SSE41-NEXT: movapd %xmm8, %xmm10
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pxor %xmm9, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: movapd %xmm8, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
 ; SSE41-NEXT: packusdw %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm9, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm3, %xmm0
@@ -2771,12 +2768,11 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm7
 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm9
+; SSE41-NEXT: pxor %xmm10, %xmm9
 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm9
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
@@ -2784,11 +2780,10 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
 ; SSE41-NEXT: movapd %xmm9, %xmm11
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm10, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
 ; SSE41-NEXT: pand %xmm3, %xmm0
@@ -2797,11 +2792,10 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2]
 ; SSE41-NEXT: pmaxud %xmm3, %xmm7
 ; SSE41-NEXT: psubd %xmm3, %xmm7
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm10, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm2
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2997,12 +2991,11 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: pand %xmm0, %xmm1
 ; SSE41-NEXT: pand %xmm0, %xmm6
 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm9
+; SSE41-NEXT: pxor %xmm10, %xmm9
 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm9
 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
@@ -3010,11 +3003,10 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
 ; SSE41-NEXT: movapd %xmm9, %xmm11
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm10, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm3
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
 ; SSE41-NEXT: pand %xmm3, %xmm0
@@ -3023,11 +3015,10 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2]
 ; SSE41-NEXT: pmaxud %xmm3, %xmm6
 ; SSE41-NEXT: psubd %xmm3, %xmm6
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm5, %xmm2
+; SSE41-NEXT: pxor %xmm10, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm8, %xmm2
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
 ; SSE41-NEXT: pand %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 949902a5ebc4..b12be7cb129d 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -631,8 +631,8 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT: pxor %xmm1, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372034707292117,9223372034707292117]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm4
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index efe34c52b371..d3e4906450e4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -84,8 +84,8 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
 ; SSE-NEXT: pshufb %xmm8, %xmm1
 ; SSE-NEXT: por %xmm4, %xmm1
 ; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm3
 ; SSE-NEXT: movdqa %xmm0, %xmm4
 ; SSE-NEXT: pmaddubsw %xmm3, %xmm4
 ; SSE-NEXT: pand %xmm2, %xmm4
@@ -120,7 +120,7 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2
 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 5568604ac29a..0af5e9aeccd9 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -57,8 +57,8 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -175,8 +175,8 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -317,12 +317,12 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -330,8 +330,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -584,35 +584,32 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm3, %xmm4
 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm8, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm5
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm3, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -620,8 +617,8 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -828,8 +825,8 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -971,8 +968,8 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -1143,12 +1140,12 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -1156,8 +1153,8 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -1333,12 +1330,12 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -1346,8 +1343,8 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -1583,35 +1580,32 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm2, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -1619,8 +1613,8 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -2239,8 +2233,8 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -2393,8 +2387,8 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -2539,12 +2533,12 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -2552,8 +2546,8 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2733,12 +2727,12 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -2746,8 +2740,8 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2987,35 +2981,32 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm2, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -3023,8 +3014,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -3277,35 +3268,32 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm1, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -3313,8 +3301,8 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -3677,79 +3665,72 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm9, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pxor %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm5
 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm10, %xmm6
+; SSE41-NEXT: pxor %xmm2, %xmm6
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm6, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm6
 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6
-; SSE41-NEXT: movdqa %xmm12, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
+; SSE41-NEXT: movdqa %xmm12, %xmm10
+; SSE41-NEXT: pxor %xmm2, %xmm10
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm10, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm10
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm10, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm10
 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10
-; SSE41-NEXT: movdqa %xmm11, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm12
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm12
+; SSE41-NEXT: movdqa %xmm11, %xmm12
+; SSE41-NEXT: pxor %xmm2, %xmm12
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm12, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm12
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm12, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm12
 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm11
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
+; SSE41-NEXT: movdqa %xmm8, %xmm11
+; SSE41-NEXT: pxor %xmm2, %xmm11
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm11, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm11
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm11, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
@@ -3757,8 +3738,8 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm8
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index d276a6873012..3c03c521c272 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -59,8 +59,8 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -182,8 +182,8 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -334,12 +334,12 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -347,8 +347,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -604,35 +604,32 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [2147483647,2147483647]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm3, %xmm4
 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0]
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm8, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm5
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm3, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -640,8 +637,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -849,8 +846,8 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -983,8 +980,8 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -1149,12 +1146,12 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -1162,8 +1159,8 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -1333,12 +1330,12 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -1346,8 +1343,8 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -1579,35 +1576,32 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [32767,32767]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm2, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -1615,8 +1609,8 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -2002,8 +1996,8 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -2148,8 +2142,8 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -2288,12 +2282,12 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -2301,8 +2295,8 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2476,12 +2470,12 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -2489,8 +2483,8 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2726,35 +2720,32 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm2, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -2762,8 +2753,8 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -3022,35 +3013,32 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pxor %xmm1, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -3058,8 +3046,8 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -3430,79 +3418,72 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm9, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm9, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pxor %xmm2, %xmm5
 ;
SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm10, %xmm6 +; SSE41-NEXT: pxor %xmm2, %xmm6 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm12, %xmm10 +; SSE41-NEXT: pxor %xmm2, %xmm10 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm10, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE41-NEXT: movdqa %xmm11, %xmm12 +; SSE41-NEXT: pxor %xmm2, %xmm12 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm12 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm12, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm12 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: movdqa %xmm8, %xmm11 +; SSE41-NEXT: pxor %xmm2, %xmm11 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 @@ -3510,8 +3491,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 412661693747..c1d22dc7daf2 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -207,20 +207,20 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm6 -; 
SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483647,2147483647,2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm3 ; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729] ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 @@ -407,34 +407,31 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 @@ -790,26 +787,25 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = 
[9223372039002324991,9223372039002324991] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm7, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -924,26 +920,25 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pand %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm7, %xmm4 +; SSE41-NEXT: packusdw %xmm6, %xmm4 ; SSE41-NEXT: packusdw %xmm4, %xmm4 ; SSE41-NEXT: movq %xmm4, (%rdi) ; SSE41-NEXT: retq @@ -1094,34 +1089,31 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm7 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, 
%xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 @@ -1869,26 +1861,25 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm7, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -2005,26 +1996,25 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: 
pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pand %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm7, %xmm4 +; SSE41-NEXT: packusdw %xmm6, %xmm4 ; SSE41-NEXT: packusdw %xmm4, %xmm4 ; SSE41-NEXT: packuswb %xmm4, %xmm4 ; SSE41-NEXT: movd %xmm4, (%rdi) @@ -2175,34 +2165,31 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm7 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 @@ -2360,34 +2347,31 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm6 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm9 +; SSE41-NEXT: pxor %xmm5, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, 
%xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 ; SSE41-NEXT: packusdw %xmm9, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 @@ -2602,44 +2586,40 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm11 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm13 +; SSE41-NEXT: pxor %xmm7, %xmm13 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm13 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 ; SSE41-NEXT: pand %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm13 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13 -; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pxor %xmm7, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm2 ; SSE41-NEXT: packusdw %xmm13, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm11, %xmm12 +; SSE41-NEXT: pxor %xmm7, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm12 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 ; SSE41-NEXT: pand %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm12 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm10, %xmm11 +; SSE41-NEXT: pxor %xmm7, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm11 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: 
pcmpgtd %xmm13, %xmm0
 ; SSE41-NEXT: pand %xmm11, %xmm0
@@ -2647,32 +2627,29 @@
 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11
 ; SSE41-NEXT: packusdw %xmm12, %xmm11
 ; SSE41-NEXT: packusdw %xmm11, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm9, %xmm10
+; SSE41-NEXT: pxor %xmm7, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
 ; SSE41-NEXT: pand %xmm10, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm10
 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm8, %xmm9
+; SSE41-NEXT: pxor %xmm7, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
 ; SSE41-NEXT: pand %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm9
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9
 ; SSE41-NEXT: packusdw %xmm10, %xmm9
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm5, %xmm8
+; SSE41-NEXT: pxor %xmm7, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm8, %xmm0
--
GitLab


From 6ce4b6dd070d9444c2a6761554d21495ba17213c Mon Sep 17 00:00:00 2001
From: Tom Eccles
Date: Fri, 18 Oct 2024 11:00:55 +0100
Subject: [PATCH 033/511] [flang][OpenMP][test] re-add complex atomic capture
 regression test (#112736)

This was reverted in https://github.com/llvm/llvm-project/pull/110969
due to a failure on aarch64.

Weirdly aarch64 (but apparently not x86?) has a spurious phi
instruction. flang -fc1 -emit-llvm will run middle-end optimization
passes. Presumably one of those is behaving differently on different
targets.

I have adapted the test to work correctly on aarch64. The difference is
in the RUN lines and the atomic exit block.
---
 .../OpenMP/atomic-capture-complex.f90         | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 flang/test/Integration/OpenMP/atomic-capture-complex.f90

diff --git a/flang/test/Integration/OpenMP/atomic-capture-complex.f90 b/flang/test/Integration/OpenMP/atomic-capture-complex.f90
new file mode 100644
index 000000000000..4ffd18097d79
--- /dev/null
+++ b/flang/test/Integration/OpenMP/atomic-capture-complex.f90
@@ -0,0 +1,60 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
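+
+! The two targets differ in the atomic exit block: the AArch64 pipeline keeps
+! an LCSSA-style phi for the captured value, while x86 stores it directly, so
+! the exit-block expectations below are split between the AARCH64 and X86
+! check prefixes. A rough sketch of the AArch64-only shape (the value names
+! here are illustrative, mirroring the CHECK captures below):
+!
+!   .atomic.exit:
+!     %lcssa = phi { float, float } [ %x.new, %.atomic.cont ]
+!     store { float, float } %lcssa, ptr %ib.addr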
+
+!RUN: %if x86-registered-target %{ %flang_fc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fopenmp %s -o - | FileCheck --check-prefixes=CHECK,X86 %s %}
+!RUN: %if aarch64-registered-target %{ %flang_fc1 -triple aarch64-unknown-linux-gnu -emit-llvm -fopenmp %s -o - | FileCheck --check-prefixes=CHECK,AARCH64 %s %}
+
+!CHECK: %[[X_NEW_VAL:.*]] = alloca { float, float }, align 8
+!CHECK: %[[VAL_1:.*]] = alloca { float, float }, i64 1, align 8
+!CHECK: %[[ORIG_VAL:.*]] = alloca { float, float }, i64 1, align 8
+!CHECK: store { float, float } { float 2.000000e+00, float 2.000000e+00 }, ptr %[[ORIG_VAL]], align 4
+!CHECK: br label %entry
+
+!CHECK: entry:
+!CHECK: %[[ATOMIC_TEMP_LOAD:.*]] = alloca { float, float }, align 8
+!CHECK: call void @__atomic_load(i64 8, ptr %[[ORIG_VAL]], ptr %[[ATOMIC_TEMP_LOAD]], i32 0)
+!CHECK: %[[PHI_NODE_ENTRY_1:.*]] = load { float, float }, ptr %[[ATOMIC_TEMP_LOAD]], align 8
+!CHECK: br label %.atomic.cont
+
+!CHECK: .atomic.cont
+!CHECK: %[[VAL_4:.*]] = phi { float, float } [ %[[PHI_NODE_ENTRY_1]], %entry ], [ %{{.*}}, %.atomic.cont ]
+!CHECK: %[[VAL_5:.*]] = extractvalue { float, float } %[[VAL_4]], 0
+!CHECK: %[[VAL_6:.*]] = extractvalue { float, float } %[[VAL_4]], 1
+!CHECK: %[[VAL_7:.*]] = fadd contract float %[[VAL_5]], 1.000000e+00
+!CHECK: %[[VAL_8:.*]] = fadd contract float %[[VAL_6]], 1.000000e+00
+!CHECK: %[[VAL_9:.*]] = insertvalue { float, float } undef, float %[[VAL_7]], 0
+!CHECK: %[[VAL_10:.*]] = insertvalue { float, float } %[[VAL_9]], float %[[VAL_8]], 1
+!CHECK: store { float, float } %[[VAL_10]], ptr %[[X_NEW_VAL]], align 4
+!CHECK: %[[VAL_11:.*]] = call i1 @__atomic_compare_exchange(i64 8, ptr %[[ORIG_VAL]], ptr %[[ATOMIC_TEMP_LOAD]], ptr %[[X_NEW_VAL]],
+!i32 2, i32 2)
+!CHECK: %[[VAL_12:.*]] = load { float, float }, ptr %[[ATOMIC_TEMP_LOAD]], align 4
+!CHECK: br i1 %[[VAL_11]], label %.atomic.exit, label %.atomic.cont
+
+!CHECK: .atomic.exit
+!AARCH64: %[[LCSSA:.*]] = phi { float, float } [ %[[VAL_10]], %.atomic.cont ]
+!AARCH64: store { float, float } %[[LCSSA]], ptr %[[VAL_1]], align 4
+!X86: store { float, float } %[[VAL_10]], ptr %[[VAL_1]], align 4
+
+program main
+  complex*8 ia, ib
+  ia = (2, 2)
+  !$omp atomic capture
+  ia = ia + (1, 1)
+  ib = ia
+  !$omp end atomic
+end program
--
GitLab


From 091a235ec5e6d5a7a8374b1265a9161c24be3513 Mon Sep 17 00:00:00 2001
From: Graham Hunter
Date: Fri, 18 Oct 2024 11:05:55 +0100
Subject: [PATCH 034/511] Revert "[AArch64][SVE] Enable max vector bandwidth
 for SVE" (#112873)

Reverts llvm/llvm-project#109671

Reverting due to some performance regressions on neoverse-v1.
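
The reverted behavior is easiest to see on a loop over i8 elements, where
maximizing bandwidth lets the vectorizer pick its VF from the narrowest
element type in the loop. A minimal sketch (illustrative IR, not taken from
the test suite):

  define void @add_i8(ptr %a, ptr %b, i64 %n) {
  entry:
    br label %loop
  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %pa = getelementptr inbounds i8, ptr %a, i64 %iv
    %pb = getelementptr inbounds i8, ptr %b, i64 %iv
    %va = load i8, ptr %pa, align 1
    %vb = load i8, ptr %pb, align 1
    %sum = add i8 %va, %vb
    store i8 %sum, ptr %pb, align 1
    %iv.next = add nuw nsw i64 %iv, 1
    %done = icmp eq i64 %iv.next, %n
    br i1 %done, label %exit, label %loop
  exit:
    ret void
  }

With bandwidth maximization enabled for SVE, loops like this were widened to
VF vscale x 16; with the revert the vectorizer goes back to preferring the
fixed-width VF 16, as the updated CHECK lines below show.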
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |   6 +-
 .../AArch64/conditional-branches-cost.ll      |  80 ++++--------
 .../scalable-vectorization-cost-tuning.ll     |  12 +-
 .../AArch64/scalable-vectorization.ll         |   4 +-
 .../LoopVectorize/AArch64/store-costs-sve.ll  | 119 +++++++-----------
 .../LoopVectorize/AArch64/sve2-histcnt.ll     |  88 +++++--------
 .../AArch64/type-shrinkage-zext-costs.ll      |  28 ++---
 .../AArch64/wider-VF-for-callinst.ll          |   2 +-
 8 files changed, 126 insertions(+), 213 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7c6b789b9c1b..ff3c69f7e10c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -337,10 +337,8 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
     TargetTransformInfo::RegisterKind K) const {
   assert(K != TargetTransformInfo::RGK_Scalar);
-  return ((K == TargetTransformInfo::RGK_FixedWidthVector &&
-           ST->isNeonAvailable()) ||
-          (K == TargetTransformInfo::RGK_ScalableVector &&
-           ST->isSVEorStreamingSVEAvailable()));
+  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
+          ST->isNeonAvailable());
 }
 
 /// Calculate the cost of materializing a 64-bit value. This helper
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 01fca39296da..7f325ce1a1f0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -732,20 +732,9 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; DEFAULT-LABEL: define void @multiple_exit_conditions(
 ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
 ; DEFAULT-NEXT: entry:
-; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32
-; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP8]]
-; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DEFAULT: vector.ph:
-; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
-; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
-; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[N_VEC]], 8
-; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
-; DEFAULT-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
-; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
 ; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
 ; DEFAULT: vector.body:
 ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -753,39 +742,20 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
 ; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP1]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP9:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; DEFAULT-NEXT: [[TMP10:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; DEFAULT-NEXT: [[TMP11:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; DEFAULT-NEXT: [[TMP12:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; DEFAULT-NEXT: [[TMP13:%.*]] = uitofp <vscale x 8 x i16> [[TMP9]] to <vscale x 8 x double>
-; DEFAULT-NEXT: [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP10]] to <vscale x 8 x double>
-; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp <vscale x 8 x i16> [[TMP11]] to <vscale x 8 x double>
-; DEFAULT-NEXT: [[TMP16:%.*]] = uitofp <vscale x 8 x i16> [[TMP12]] to <vscale x 8 x double>
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
 ; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; DEFAULT-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP19]]
-; DEFAULT-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
-; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP22]]
-; DEFAULT-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24
-; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP25]]
-; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP13]], ptr [[TMP4]], align 8
-; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP14]], ptr [[TMP20]], align 8
-; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP15]], ptr [[TMP23]], align 8
-; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP16]], ptr [[TMP26]], align 8
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; DEFAULT-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; DEFAULT-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; DEFAULT: middle.block:
-; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; DEFAULT: scalar.ph:
 ; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ]
-; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; DEFAULT-NEXT: br label [[LOOP:%.*]]
 ; DEFAULT: vector.scevcheck:
 ; DEFAULT-NEXT: unreachable
@@ -810,7 +780,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; PRED: vector.ph:
 ; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
@@ -819,31 +789,31 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
 ; PRED-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
 ; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
 ; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]]
 ; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]]
 ; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 257)
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 257)
 ; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; PRED: vector.body:
 ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
 ; PRED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
 ; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP12]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP13]] to <vscale x 8 x double>
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP12]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i64 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
 ; PRED-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv8f64.p0(<vscale x 8 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP10]])
-; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 8 x i1> [[TMP16]], i32 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
+; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
 ; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; PRED: middle.block:
 ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index 59da1e10fd2a..f28f77bf1b15 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -1,23 +1,23 @@
 ; REQUIRES: asserts
 ; RUN: opt -mtriple=aarch64 -mattr=+sve \
 ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
+; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
 
 ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
 ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
+; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
 ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
+; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
 ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
+; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
 ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
+; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
 
 ; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
 ; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
@@ -29,7 +29,7 @@
 ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
 
 ; VF-4: <4 x i32>
-; VF-VSCALE16: <vscale x 16 x i32>
+; VF-VSCALE4: <16 x i32>
 define void @test0(ptr %a, ptr %b, ptr %c) #0 {
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
index a84932a2290d..e83eb729b521 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -8,8 +8,8 @@
 ; (maximized bandwidth for i8 in the loop).
 define void @test0(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK: LV: Checking a loop in 'test0'
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 16
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 16
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
 ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index a4861ad0b261..7d2fc348480a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -145,7 +145,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; DEFAULT-NEXT: iter.check:
 ; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
 ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; DEFAULT: vector.memcheck:
@@ -155,72 +155,59 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; DEFAULT-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; DEFAULT: vector.main.loop.iter.check:
-; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP9]], 32
-; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 0, [[TMP3]]
-; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DEFAULT: vector.ph:
-; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
-; DEFAULT-NEXT: [[N_MOD_VF1:%.*]] = urem i64 0, [[TMP5]]
-; DEFAULT-NEXT: [[N_VEC1:%.*]] = sub i64 0, [[N_MOD_VF1]]
-; DEFAULT-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 32
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i16> poison, i16 [[X]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP8:%.*]] = trunc <vscale x 16 x i16> [[BROADCAST_SPLAT]] to <vscale x 16 x i8>
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT3]], <16 x i16> poison, <16 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT4]] to <16 x i8>
 ; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
 ; DEFAULT: vector.body:
 ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; DEFAULT-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP14]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP11:%.*]] = trunc <vscale x 16 x i64> [[BROADCAST_SPLAT3]] to <vscale x 16 x i8>
-; DEFAULT-NEXT: [[TMP22:%.*]] = and <vscale x 16 x i8> [[TMP11]], [[TMP8]]
-; DEFAULT-NEXT: [[TMP13:%.*]] = and <vscale x 16 x i8> [[TMP11]], [[TMP8]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TMP4]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP5:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8>
+; DEFAULT-NEXT: [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
+; DEFAULT-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
 ; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
 ; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP23:%.*]] = mul i64 [[TMP16]], 16
-; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP23]]
-; DEFAULT-NEXT: store <vscale x 16 x i8> [[TMP22]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
-; DEFAULT-NEXT: store <vscale x 16 x i8> [[TMP13]], ptr [[TMP24]], align 1, !alias.scope [[META8]], !noalias [[META5]]
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC1]]
-; DEFAULT-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16
+; DEFAULT-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
+; DEFAULT-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1, !alias.scope [[META8]], !noalias [[META5]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; DEFAULT-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; DEFAULT: middle.block:
-; DEFAULT-NEXT: [[CMP_N1:%.*]] = icmp eq i64 0, [[N_VEC1]]
-; DEFAULT-NEXT: br i1 [[CMP_N1]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; DEFAULT: vec.epilog.iter.check:
-; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 0, [[N_VEC1]]
 ; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP31:%.*]] = mul i64 [[TMP15]], 8
-; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP31]]
+; DEFAULT-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP16]]
 ; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; DEFAULT: vec.epilog.ph:
-; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; DEFAULT-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 8
+; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
 ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP18]]
 ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
 ; DEFAULT-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT6]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP32:%.*]] = trunc <vscale x 8 x i16> [[BROADCAST_SPLAT7]] to <vscale x 8 x i8>
+; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP24:%.*]] = trunc <vscale x 2 x i16> [[BROADCAST_SPLAT7]] to <vscale x 2 x i8>
 ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; DEFAULT: vec.epilog.vector.body:
 ; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX5]], 0
-; DEFAULT-NEXT: [[TMP33:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP33]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP29:%.*]] = trunc <vscale x 8 x i64> [[BROADCAST_SPLAT10]] to <vscale x 8 x i8>
-; DEFAULT-NEXT: [[TMP30:%.*]] = and <vscale x 8 x i8> [[TMP29]], [[TMP32]]
+; DEFAULT-NEXT: [[TMP22:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP22]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP23:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT]] to <vscale x 2 x i8>
+; DEFAULT-NEXT: [[TMP25:%.*]] = and <vscale x 2 x i8> [[TMP23]], [[TMP24]]
 ; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP21]]
 ; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP30]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]]
+; DEFAULT-NEXT: store <vscale x 2 x i8> [[TMP25]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]]
 ; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], [[TMP20]]
 ; DEFAULT-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]]
 ; DEFAULT-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -228,7 +215,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
 ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; DEFAULT: vec.epilog.scalar.ph:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; DEFAULT-NEXT: br label [[LOOP:%.*]]
 ; DEFAULT: loop:
 ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -247,10 +234,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; PRED-LABEL: define void @trunc_store(
 ; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; PRED-NEXT: entry:
-; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP7]], 16
-; PRED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
-; PRED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; PRED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; PRED: vector.memcheck:
 ; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8
 ; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
@@ -258,35 +242,28 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; PRED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; PRED: vector.ph:
-; PRED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
-; PRED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
-; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP4]], 16
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i16> poison, i16 [[X]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
-; PRED-NEXT: [[TMP12:%.*]] = trunc <vscale x 16 x i16> [[BROADCAST_SPLAT]] to <vscale x 16 x i8>
+; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT1]], <16 x i16> poison, <16 x i32> zeroinitializer
+; PRED-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT2]] to <16 x i8>
 ; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; PRED: vector.body:
 ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP8]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
-; PRED-NEXT: [[TMP9:%.*]] = trunc <vscale x 16 x i64> [[BROADCAST_SPLAT2]] to <vscale x 16 x i8>
-; PRED-NEXT: [[TMP10:%.*]] = and <vscale x 16 x i8> [[TMP9]], [[TMP12]]
+; PRED-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; PRED-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT]] to <16 x i8>
+; PRED-NEXT: [[TMP4:%.*]] = and <16 x i8> [[TMP2]], [[TMP3]]
 ; PRED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
 ; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; PRED-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
-; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; PRED-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
+; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; PRED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; PRED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; PRED: middle.block:
-; PRED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; PRED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED: scalar.ph:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; PRED-NEXT: br label [[LOOP:%.*]]
 ; PRED: loop:
 ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index 04ac89518502..0c41477f285d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -313,68 +313,36 @@ for.exit:
 define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
 ; CHECK-LABEL: define void @histogram_8bit(
 ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: iter.check:
+; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 2
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP9]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -16
+; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i32> [[WIDE_LOAD]] to <vscale x 16 x i64>
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], <vscale x 16 x i64> [[TMP10]]
-; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> [[TMP20]], i8 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer))
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
+; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> [[TMP7]], i8 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; 
CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 3 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP12]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[DOTNEG8:%.*]] = mul nsw i64 [[TMP13]], -8 -; CHECK-NEXT: [[N_VEC3:%.*]] = and i64 [[N]], [[DOTNEG8]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX4]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP16]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = zext [[WIDE_LOAD5]] to -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP17]] -; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv8p0.i8( [[TMP18]], i8 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_EXIT]], label [[SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY2:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY2]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] ; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_INDICES]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64 @@ -384,7 +352,7 @@ define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) ; CHECK-NEXT: store i8 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY2]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -425,7 +393,7 @@ define void @histogram_float(ptr noalias %buckets, ptr readonly 
%indices, i64 %N ; CHECK-NEXT: store float [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -468,7 +436,7 @@ define void @histogram_varying_increment(ptr noalias %buckets, ptr readonly %ind ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -526,7 +494,7 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP21]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -544,7 +512,7 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -596,7 +564,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 { ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP11]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -614,7 +582,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 { ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: 
for.exit: ; CHECK-NEXT: ret void ; @@ -666,7 +634,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP7]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -684,7 +652,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado ; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -733,13 +701,13 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -806,7 +774,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -827,7 +795,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void 
; @@ -919,7 +887,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64( [[TMP6]], i64 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -936,7 +904,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: store i64 [[INC]], ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll index 691c743be7d7..dec3c286345a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll @@ -24,25 +24,25 @@ define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP11]], ptr 
[[TMP12]], align 2 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -107,25 +107,25 @@ define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll index a1a13f1e0c37..4a2f9d07ed91 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE -; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth=false -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW +; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW target triple = "aarch64-unknown-linux-gnu" -- GitLab From 9065b759ae73fac5edc01fc6c3878642bedfca5d Mon Sep 17 
00:00:00 2001
From: Nikita Popov
Date: Fri, 18 Oct 2024 12:05:50 +0200
Subject: [PATCH 035/511] Revert "[APInt] Enable APInt ctor assertion by
 default (#112670)"

This reverts commit ba1ee2bab7a4cdc0975686e5099461c0b12c5345.

Causes some buildbot failures on aarch64.
---
 llvm/include/llvm/ADT/APInt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index 953b2a27b715..63a138527b32 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -109,7 +109,7 @@ public:
   /// \param implicitTrunc allow implicit truncation of non-zero/sign bits of
   /// val beyond the range of numBits
   APInt(unsigned numBits, uint64_t val, bool isSigned = false,
-        bool implicitTrunc = false)
+        bool implicitTrunc = true)
       : BitWidth(numBits) {
     if (!implicitTrunc) {
       if (isSigned) {
-- 
GitLab

From 5c37316b54ae763b3dacb6f1e8e1e94348ab4512 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 18 Oct 2024 11:11:56 +0100
Subject: [PATCH 036/511] [DAG] visitFMA/FMAD - use FoldConstantArithmetic to
 add missing vector constant folding support
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 ++--
 llvm/test/CodeGen/X86/fma.ll                  | 72 ++++---------------
 2 files changed, 17 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3ff6ad28dc6a..ea869371ae10 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17368,11 +17368,9 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
   MatchContextClass matcher(DAG, TLI, N);

   // Constant fold FMA.
-  if (isa<ConstantFPSDNode>(N0) &&
-      isa<ConstantFPSDNode>(N1) &&
-      isa<ConstantFPSDNode>(N2)) {
-    return matcher.getNode(ISD::FMA, DL, VT, N0, N1, N2);
-  }
+  if (SDValue C =
+          DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1, N2}))
+    return C;

   // (-N0 * -N1) + N2 --> (N0 * N1) + N2
   TargetLowering::NegatibleCost CostN0 =
@@ -17488,9 +17486,8 @@ SDValue DAGCombiner::visitFMAD(SDNode *N) {
   SDLoc DL(N);

   // Constant fold FMAD.
- if (isa(N0) && isa(N1) && - isa(N2)) - return DAG.getNode(ISD::FMAD, DL, VT, N0, N1, N2); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FMAD, DL, VT, {N0, N1, N2})) + return C; return SDValue(); } diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll index 4f5e9af97bc8..f26960b069b0 100644 --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -2143,15 +2143,9 @@ define float @constant_fold_f32() { define <4 x float> @constant_fold_v4f32() { ; FMA32-LABEL: constant_fold_v4f32: ; FMA32: ## %bb.0: -; FMA32-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,1.0E+1,2.0E+1,3.0E+1] -; FMA32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x0d,A,A,A,A] -; FMA32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; FMA32-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+1,5.0E+1,6.0E+1,7.0E+1] +; FMA32-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,4.9E+2,1.18E+3,2.07E+3] ; FMA32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; FMA32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; FMA32-NEXT: vfmadd213ps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x05,A,A,A,A] -; FMA32-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; FMA32-NEXT: ## xmm0 = (xmm1 * xmm0) + mem ; FMA32-NEXT: retl ## encoding: [0xc3] ; ; FMACALL32-LABEL: constant_fold_v4f32: @@ -2163,15 +2157,9 @@ define <4 x float> @constant_fold_v4f32() { ; ; FMA64-LABEL: constant_fold_v4f32: ; FMA64: ## %bb.0: -; FMA64-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,1.0E+1,2.0E+1,3.0E+1] -; FMA64-NEXT: ## encoding: [0xc5,0xf8,0x28,0x0d,A,A,A,A] -; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; FMA64-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+1,5.0E+1,6.0E+1,7.0E+1] +; FMA64-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,4.9E+2,1.18E+3,2.07E+3] ; FMA64-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; FMA64-NEXT: vfmadd213ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x05,A,A,A,A] -; FMA64-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; FMA64-NEXT: ## xmm0 = (xmm1 * xmm0) + mem ; FMA64-NEXT: retq ## encoding: [0xc3] ; ; FMACALL64-LABEL: constant_fold_v4f32: @@ -2183,28 +2171,16 @@ define <4 x float> @constant_fold_v4f32() { ; ; AVX512-LABEL: constant_fold_v4f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,1.0E+1,2.0E+1,3.0E+1] -; AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x0d,A,A,A,A] -; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+1,5.0E+1,6.0E+1,7.0E+1] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,4.9E+2,1.18E+3,2.07E+3] ; AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: vfmadd213ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x05,A,A,A,A] -; AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: ## xmm0 = (xmm1 * xmm0) + mem ; AVX512-NEXT: retq ## encoding: [0xc3] ; ; AVX512VL-LABEL: constant_fold_v4f32: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,1.0E+1,2.0E+1,3.0E+1] -; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0d,A,A,A,A] -; 
AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+1,5.0E+1,6.0E+1,7.0E+1] +; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,4.9E+2,1.18E+3,2.07E+3] ; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512VL-NEXT: vfmadd213ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x05,A,A,A,A] -; AVX512VL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512VL-NEXT: ## xmm0 = (xmm1 * xmm0) + mem ; AVX512VL-NEXT: retq ## encoding: [0xc3] %r = call <4 x float> @llvm.fma.v4f32(<4 x float> , <4 x float> , <4 x float> ) ret <4 x float> %r @@ -2213,15 +2189,9 @@ define <4 x float> @constant_fold_v4f32() { define <2 x double> @constant_fold_v2f64() { ; FMA32-LABEL: constant_fold_v2f64: ; FMA32: ## %bb.0: -; FMA32-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+1,2.0E+1] -; FMA32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x0d,A,A,A,A] -; FMA32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; FMA32-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0] -; FMA32-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A] +; FMA32-NEXT: vmovaps {{.*#+}} xmm0 = [4.1E+2,1.4E+3] +; FMA32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; FMA32-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; FMA32-NEXT: vfmadd231pd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xb8,0x05,A,A,A,A] -; FMA32-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; FMA32-NEXT: ## xmm0 = (xmm1 * mem) + xmm0 ; FMA32-NEXT: retl ## encoding: [0xc3] ; ; FMACALL32-LABEL: constant_fold_v2f64: @@ -2233,15 +2203,9 @@ define <2 x double> @constant_fold_v2f64() { ; ; FMA64-LABEL: constant_fold_v2f64: ; FMA64: ## %bb.0: -; FMA64-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+1,2.0E+1] -; FMA64-NEXT: ## encoding: [0xc5,0xf9,0x28,0x0d,A,A,A,A] -; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; FMA64-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0] -; FMA64-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A] +; FMA64-NEXT: vmovaps {{.*#+}} xmm0 = [4.1E+2,1.4E+3] +; FMA64-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; FMA64-NEXT: vfmadd231pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xb8,0x05,A,A,A,A] -; FMA64-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; FMA64-NEXT: ## xmm0 = (xmm1 * mem) + xmm0 ; FMA64-NEXT: retq ## encoding: [0xc3] ; ; FMACALL64-LABEL: constant_fold_v2f64: @@ -2253,28 +2217,16 @@ define <2 x double> @constant_fold_v2f64() { ; ; AVX512-LABEL: constant_fold_v2f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+1,2.0E+1] -; AVX512-NEXT: ## encoding: [0xc5,0xf9,0x28,0x0d,A,A,A,A] -; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0] -; AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.1E+2,1.4E+3] +; AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, 
kind: reloc_riprel_4byte -; AVX512-NEXT: vfmadd231pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xb8,0x05,A,A,A,A] -; AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: ## xmm0 = (xmm1 * mem) + xmm0 ; AVX512-NEXT: retq ## encoding: [0xc3] ; ; AVX512VL-LABEL: constant_fold_v2f64: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+1,2.0E+1] -; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x0d,A,A,A,A] -; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0] -; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A] +; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [4.1E+2,1.4E+3] +; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512VL-NEXT: vfmadd231pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xb8,0x05,A,A,A,A] -; AVX512VL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512VL-NEXT: ## xmm0 = (xmm1 * mem) + xmm0 ; AVX512VL-NEXT: retq ## encoding: [0xc3] %r = call <2 x double> @llvm.fma.v2f64(<2 x double> , <2 x double> , <2 x double> ) ret <2 x double> %r -- GitLab From c7d1163554e36d16530cf64eebe447f3062b24b5 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 18 Oct 2024 12:28:15 +0200 Subject: [PATCH 037/511] [bazel] Port 7be4ab0a86f9a52f1b49dad5665617441ec24a2e --- .../llvm-project-overlay/libc/BUILD.bazel | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 8f3bbe68648f..1abc0ccda4c7 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -103,6 +103,17 @@ libc_support_library( deps = [":llvm_libc_macros_float_macros"], ) +libc_support_library( + name = "llvm_libc_types_cfloat128", + hdrs = ["include/llvm-libc-types/cfloat128.h"], + deps = [":llvm_libc_macros_float_macros"], +) + +libc_support_library( + name = "llvm_libc_types_cfloat16", + hdrs = ["include/llvm-libc-types/cfloat16.h"], +) + libc_support_library( name = "llvm_libc_macros_fcntl_macros", hdrs = ["include/llvm-libc-macros/linux/fcntl-macros.h"], @@ -268,6 +279,16 @@ libc_support_library( hdrs = ["src/__support/macros/properties/os.h"], ) +libc_support_library( + name = "__support_macros_properties_complex_types", + hdrs = ["src/__support/macros/properties/complex_types.h"], + deps = [ + ":__support_macros_properties_types", + ":llvm_libc_types_cfloat128", + ":llvm_libc_types_cfloat16", + ], +) + libc_support_library( name = "__support_macros_properties_types", hdrs = ["src/__support/macros/properties/types.h"], @@ -493,6 +514,7 @@ libc_support_library( deps = [ ":__support_macros_attributes", ":__support_macros_config", + ":__support_macros_properties_complex_types", ":__support_macros_properties_types", ":llvm_libc_macros_stdfix_macros", ], -- GitLab From 8e6abf526e74e9148393dd9ea0c8e91649b2ae49 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 18 Oct 2024 12:30:16 +0200 Subject: [PATCH 038/511] Fix -Wswitch after 508fd966fb00428ccd1dd7ddeb636fb7393029ec --- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 3 +++ 1 file 
changed, 3 insertions(+)

diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 50115a638b95..e710f976ccc4 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -5066,6 +5066,9 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
     case clang::BuiltinType::SveUint64x3:
     case clang::BuiltinType::SveUint64x4:
     case clang::BuiltinType::SveMFloat8:
+    case clang::BuiltinType::SveMFloat8x2:
+    case clang::BuiltinType::SveMFloat8x3:
+    case clang::BuiltinType::SveMFloat8x4:
     case clang::BuiltinType::SveFloat16:
     case clang::BuiltinType::SveBFloat16:
     case clang::BuiltinType::SveBFloat16x2:
-- 
GitLab

From 55cbbce0958c8dbd4ae800d16d1d12a31173ace4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?=
Date: Fri, 18 Oct 2024 12:53:47 +0200
Subject: [PATCH 039/511] [clang][ASTImporter] Fix of unchecked Error object
 (#112688)

After commits 9c72a30 and 30a9cac, the error handling in the function
'importTemplateParameterDefaultArgument' was not correct, probably related
to (not) using std::move. A crash from an unchecked Error result could
happen when the import error path was taken. This patch adds a test that
reproduces the case and fixes the problem.
---
 clang/lib/AST/ASTImporter.cpp           | 26 +++++++++++------------
 clang/unittests/AST/ASTImporterTest.cpp | 28 +++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 020a2f396b5a..e7a6509167f0 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -362,24 +362,24 @@ namespace clang {
   template <typename TemplateParmDeclT>
   Error importTemplateParameterDefaultArgument(const TemplateParmDeclT *D,
                                                TemplateParmDeclT *ToD) {
-    Error Err = Error::success();
     if (D->hasDefaultArgument()) {
       if (D->defaultArgumentWasInherited()) {
-        auto *ToInheritedFrom = const_cast<TemplateParmDeclT *>(
-            importChecked(Err, D->getDefaultArgStorage().getInheritedFrom()));
-        if (Err)
-          return Err;
+        Expected<TemplateParmDeclT *> ToInheritedFromOrErr =
+            import(D->getDefaultArgStorage().getInheritedFrom());
+        if (!ToInheritedFromOrErr)
+          return ToInheritedFromOrErr.takeError();
+        TemplateParmDeclT *ToInheritedFrom = *ToInheritedFromOrErr;
         if (!ToInheritedFrom->hasDefaultArgument()) {
           // Resolve possible circular dependency between default value of the
           // template argument and the template declaration.
-          const auto ToInheritedDefaultArg =
-              importChecked(Err, D->getDefaultArgStorage()
-                                     .getInheritedFrom()
-                                     ->getDefaultArgument());
-          if (Err)
-            return Err;
+          Expected<TemplateArgumentLoc> ToInheritedDefaultArgOrErr =
+              import(D->getDefaultArgStorage()
+                         .getInheritedFrom()
+                         ->getDefaultArgument());
+          if (!ToInheritedDefaultArgOrErr)
+            return ToInheritedDefaultArgOrErr.takeError();
           ToInheritedFrom->setDefaultArgument(Importer.getToContext(),
-                                              ToInheritedDefaultArg);
+                                              *ToInheritedDefaultArgOrErr);
         }
         ToD->setInheritedDefaultArgument(ToD->getASTContext(),
                                          ToInheritedFrom);
@@ -395,7 +395,7 @@ namespace clang {
             *ToDefaultArgOrErr);
       }
     }
-    return Err;
+    return Error::success();
   }

 public:
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index aacecd3fbcd9..bf7313f882e4 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -9986,6 +9986,34 @@ TEST_P(ImportTemplateParmDeclDefaultValue, InvisibleInheritedFrom) {
                                  ToFDef->getTemplateParameters()->getParam(0));
 }

+TEST_P(ImportTemplateParmDeclDefaultValue, DefValImportError) {
+  const char *ToCode =
+      R"(
+      class X {
+        int A;
+      };
+      )";
+  getToTuDecl(ToCode, Lang_CXX14);
+
+  const char *FromCode =
+      R"(
+      class X;
+
+      template <typename T = X>
+      void f() {}
+
+      class X {
+        char A;
+      };
+      )";
+  TranslationUnitDecl *FromTU = getTuDecl(FromCode, Lang_CXX14);
+  auto *FromF = FirstDeclMatcher<FunctionTemplateDecl>().match(
+      FromTU, functionTemplateDecl(hasName("f")));
+
+  auto *ToFImported = Import(FromF, Lang_CXX14);
+  EXPECT_FALSE(ToFImported);
+}
+
 TEST_P(ImportTemplateParmDeclDefaultValue, ImportFunctionTemplate) {
   TranslationUnitDecl *FromTU = getTuDecl(CodeFunction, Lang_CXX14);
   auto *D3 = LastDeclMatcher<FunctionTemplateDecl>().match(
-- 
GitLab

From e1330d96a086a9a3f6d8f11b8b8e3c2b6c500018 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 18 Oct 2024 11:57:30 +0100
Subject: [PATCH 040/511] [DAG] visitFMA/FDIV - avoid SDLoc duplication. NFC.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ea869371ae10..98eed6b7503d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17398,14 +17398,14 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {

   // FIXME: Support splat of constant.
   if (N0CFP && N0CFP->isExactlyValue(1.0))
-    return matcher.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
+    return matcher.getNode(ISD::FADD, DL, VT, N1, N2);
   if (N1CFP && N1CFP->isExactlyValue(1.0))
-    return matcher.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);
+    return matcher.getNode(ISD::FADD, DL, VT, N0, N2);

   // Canonicalize (fma c, x, y) -> (fma x, c, y)
   if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
       !DAG.isConstantFPBuildVectorOrConstantFP(N1))
-    return matcher.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
+    return matcher.getNode(ISD::FMA, DL, VT, N1, N0, N2);

   bool CanReassociate =
       Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
@@ -17713,7 +17713,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
         TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
     if (NegN1 && (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
                   CostN1 == TargetLowering::NegatibleCost::Cheaper))
-      return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1);
+      return DAG.getNode(ISD::FDIV, DL, VT, NegN0, NegN1);
   }

   if (SDValue R = combineFMulOrFDivWithIntPow2(N))
-- 
GitLab

From d5087012498a43ad8345dc75be229e2e924660c2 Mon Sep 17 00:00:00 2001
From: Shourya Goel
Date: Fri, 18 Oct 2024 16:29:07 +0530
Subject: [PATCH 041/511] [libc][complex] add additional header guards for
 CFP16 and CFP128 (#112879)

Fixes build errors due to #112594
---
 libc/include/llvm-libc-types/cfloat128.h | 23 +++++++++++++----------
 libc/include/llvm-libc-types/cfloat16.h  |  3 ++-
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/libc/include/llvm-libc-types/cfloat128.h b/libc/include/llvm-libc-types/cfloat128.h
index 0cc8ed3041d6..a371671cf623 100644
--- a/libc/include/llvm-libc-types/cfloat128.h
+++ b/libc/include/llvm-libc-types/cfloat128.h
@@ -12,21 +12,24 @@
 #include "../llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG

 // Currently, the complex variant of C23 `_Float128` type is only defined as a
-// built-in type in GCC 7 or later, and only for C. For C++, or for clang,
-// the complex variant of `__float128` is defined instead, and only on x86-64
-// targets.
+// built-in type in GCC 7 or later, for C and in GCC 13 or later, for C++. For
+// clang, the complex variant of `__float128` is defined instead, and only on
+// x86-64 targets for clang 11 or later.
 //
 // TODO: Update the complex variant of C23 `_Float128` type detection again when
 // clang supports it.
-// https://github.com/llvm/llvm-project/issues/80195
-#if defined(__STDC_IEC_60559_COMPLEX__) && !defined(__clang__) && \
-    !defined(__cplusplus)
+#if defined(__STDC_IEC_60559_COMPLEX__) && !defined(__clang__)
+#if !defined(__cplusplus)
 #define LIBC_TYPES_HAS_CFLOAT128
 typedef _Complex _Float128 cfloat128;
-#elif defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
-// Use _Complex __float128 type. gcc and clang sometime use __SIZEOF_FLOAT128__
-// to notify the availability of __float128. clang also uses __FLOAT128__ macro
-// to notify the availability of __float128 type:
+#elif defined(__GNUC__) && __GNUC__ >= 13
+#define LIBC_TYPES_HAS_CFLOAT128
+typedef _Complex _Float128 cfloat128;
+#endif
+#elif __clang_major__ >= 11 && \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+// Use _Complex __float128 type.
clang uses __SIZEOF_FLOAT128__ or __FLOAT128__ +// macro to notify the availability of __float128 type: // https://reviews.llvm.org/D15120 #define LIBC_TYPES_HAS_CFLOAT128 typedef _Complex __float128 cfloat128; diff --git a/libc/include/llvm-libc-types/cfloat16.h b/libc/include/llvm-libc-types/cfloat16.h index e7e5631e0250..2d4cef756272 100644 --- a/libc/include/llvm-libc-types/cfloat16.h +++ b/libc/include/llvm-libc-types/cfloat16.h @@ -10,7 +10,8 @@ #define LLVM_LIBC_TYPES_CFLOAT16_H #if defined(__FLT16_MANT_DIG__) && \ - (!defined(__GNUC__) || __GNUC__ >= 13 || defined(__clang__)) && \ + (!defined(__GNUC__) || __GNUC__ >= 13 || \ + (defined(__clang__) && __clang_major__ >= 14)) && \ !defined(__arm__) && !defined(_M_ARM) && !defined(__riscv) && \ !defined(_WIN32) #define LIBC_TYPES_HAS_CFLOAT16 -- GitLab From 3eaf4a9d1a847a4e03a21365682b3a73d7e2e6d0 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 18 Oct 2024 13:03:13 +0200 Subject: [PATCH 042/511] [clang][bytecode] Check for memory leaks after destroying global scope (#112868) The global scope we create when evaluating expressions might free some of the dynamic memory allocations, so we can't check for memory leaks before destroying it. --- clang/lib/AST/ByteCode/Compiler.cpp | 18 ++++++++++++------ clang/lib/AST/ByteCode/Context.cpp | 3 +-- clang/lib/AST/ByteCode/EvalEmitter.cpp | 17 ----------------- clang/lib/AST/ByteCode/Interp.h | 4 ++++ clang/lib/AST/ByteCode/Opcodes.td | 1 + clang/test/AST/ByteCode/new-delete.cpp | 22 ++++++++++++++++++++++ 6 files changed, 40 insertions(+), 25 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index a71c0dcc9381..672fa7fc25d6 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4132,10 +4132,16 @@ template bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { LocalScope RootScope(this); + // If we won't destroy the toplevel scope, check for memory leaks first. + if (!DestroyToplevelScope) { + if (!this->emitCheckAllocations(E)) + return false; + } + auto maybeDestroyLocals = [&]() -> bool { if (DestroyToplevelScope) - return RootScope.destroyLocals(); - return true; + return RootScope.destroyLocals() && this->emitCheckAllocations(E); + return this->emitCheckAllocations(E); }; // Void expressions. 
@@ -4171,8 +4177,7 @@ bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { return this->emitRetValue(E) && maybeDestroyLocals(); } - (void)maybeDestroyLocals(); - return false; + return maybeDestroyLocals() && this->emitCheckAllocations(E) && false; } template @@ -4214,7 +4219,8 @@ bool Compiler::visitDeclAndReturn(const VarDecl *VD, DeclScope LS(this, VD); if (!this->visit(VD->getAnyInitializer())) return false; - return this->emitRet(VarT.value_or(PT_Ptr), VD) && LS.destroyLocals(); + return this->emitRet(VarT.value_or(PT_Ptr), VD) && LS.destroyLocals() && + this->emitCheckAllocations(VD); } LocalScope VDScope(this, VD); @@ -4260,7 +4266,7 @@ bool Compiler::visitDeclAndReturn(const VarDecl *VD, return false; } - return VDScope.destroyLocals(); + return VDScope.destroyLocals() && this->emitCheckAllocations(VD); } template diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index 9bca8138cd9f..7088cf02901c 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -78,8 +78,7 @@ bool Context::evaluate(State &Parent, const Expr *E, APValue &Result, Compiler C(*this, *P, Parent, Stk); auto Res = C.interpretExpr(E, /*ConvertResultToRValue=*/false, - /*DestroyToplevelScope=*/Kind == - ConstantExprKind::ClassTemplateArgument); + /*DestroyToplevelScope=*/true); if (Res.isInvalid()) { C.cleanup(); Stk.clearTo(StackSizeBefore); diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index 7eecee25bb3c..65ad960cfa8d 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -132,17 +132,10 @@ bool EvalEmitter::fallthrough(const LabelTy &Label) { return true; } -static bool checkReturnState(InterpState &S) { - return S.maybeDiagnoseDanglingAllocations(); -} - template bool EvalEmitter::emitRet(const SourceInfo &Info) { if (!isActive()) return true; - if (!checkReturnState(S)) - return false; - using T = typename PrimConv::T; EvalResult.setValue(S.Stk.pop().toAPValue(Ctx.getASTContext())); return true; @@ -159,9 +152,6 @@ template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { if (CheckFullyInitialized && !EvalResult.checkFullyInitialized(S, Ptr)) return false; - if (!checkReturnState(S)) - return false; - // Implicitly convert lvalue to rvalue, if requested. if (ConvertResultToRValue) { if (!Ptr.isZero() && !Ptr.isDereferencable()) @@ -194,16 +184,12 @@ template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { if (!isActive()) return true; - if (!checkReturnState(S)) - return false; // Function pointers cannot be converted to rvalues. 
  EvalResult.setFunctionPointer(S.Stk.pop<FunctionPointer>());
   return true;
 }

 bool EvalEmitter::emitRetVoid(const SourceInfo &Info) {
-  if (!checkReturnState(S))
-    return false;
   EvalResult.setValid();
   return true;
 }
@@ -216,9 +202,6 @@ bool EvalEmitter::emitRetValue(const SourceInfo &Info) {
   if (CheckFullyInitialized && !EvalResult.checkFullyInitialized(S, Ptr))
     return false;

-  if (!checkReturnState(S))
-    return false;
-
   if (std::optional<APValue> APV =
           Ptr.toRValue(S.getASTContext(), EvalResult.getSourceType())) {
     EvalResult.setValue(*APV);
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index f034bde30903..aafc848a9c53 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -3007,6 +3007,10 @@ static inline bool IsConstantContext(InterpState &S, CodePtr OpPC) {
   return true;
 }

+static inline bool CheckAllocations(InterpState &S, CodePtr OpPC) {
+  return S.maybeDiagnoseDanglingAllocations();
+}
+
 /// Check if the initializer and storage types of a placement-new expression
 /// match.
 bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E,
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 4fa9b6d61d5a..a1970f25ca97 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -836,3 +836,4 @@ def CheckNewTypeMismatchArray : Opcode {
 }

 def IsConstantContext: Opcode;
+def CheckAllocations : Opcode;
diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp
index 8bcbed1aba21..94fe2d4497df 100644
--- a/clang/test/AST/ByteCode/new-delete.cpp
+++ b/clang/test/AST/ByteCode/new-delete.cpp
@@ -796,6 +796,28 @@ static_assert(virt_delete(false)); // both-error {{not an integral constant expr
                                    //   both-note {{in call to}}

+namespace ToplevelScopeInTemplateArg {
+  class string {
+  public:
+    char *mem;
+    constexpr string() {
+      this->mem = new char(1);
+    }
+    constexpr ~string() {
+      delete this->mem;
+    }
+    constexpr unsigned size() const { return 4; }
+  };
+
+
+  template <string S>
+  void test() {};
+
+  void f() {
+    test<string()>();
+    static_assert(string().size() == 4);
+  }
+}

 #else
 /// Make sure we reject this prior to C++20
-- 
GitLab

From 332ac18e318ce0b6bf316d7f35d33d8af4c56fc5 Mon Sep 17 00:00:00 2001
From: c8ef
Date: Fri, 18 Oct 2024 19:03:50 +0800
Subject: [PATCH 043/511] [clang] constexpr built-in abs function. (#112539)

According to [P0533R9](https://wg21.link/P0533R9), the C++ standard library
functions corresponding to the C macros in `[c.math.abs]` are now
`constexpr`. To implement this feature in libc++, we must make the built-in
abs function `constexpr`. This patch adds the implementation of a
`constexpr` abs function for the current constant evaluator and the new
bytecode interpreter.

It is important to note that in two's complement systems, the absolute value
of the most negative value is out of range. In GCC, it results in an
out-of-range error and is not evaluated as a constant. We follow the same
approach here.
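For illustration, here is a small usage sketch based on the tests this patch
adds (the snippet itself is not part of the patch):

```cpp
// With this change, the __builtin_* abs variants fold in constant expressions.
static_assert(__builtin_abs(-14) == 14, "");
static_assert(__builtin_labs(-0x14L) == 0x14L, "");
static_assert(__builtin_llabs(-0x141414141414LL) == 0x141414141414LL, "");

// As noted above, the absolute value of the most negative value stays
// non-constant, so this is rejected at compile time:
// constexpr int bad = __builtin_abs(INT_MIN); // error: not a constant expression
```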
--- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/Builtins.td | 1 + clang/lib/AST/ByteCode/InterpBuiltin.cpp | 21 +++++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 14 +++++++++++++ clang/test/AST/ByteCode/builtin-functions.cpp | 14 +++++++++++++ clang/test/CodeGenCXX/builtins.cpp | 14 ++++++------- clang/test/Sema/constant-builtins-2.c | 13 +++++++++++- 7 files changed, 69 insertions(+), 9 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a65bd6f38290..b34da2d75570 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -272,6 +272,7 @@ Non-comprehensive list of changes in this release ``__builtin_signbit`` can now be used in constant expressions. - Plugins can now define custom attributes that apply to statements as well as declarations. +- ``__builtin_abs`` function can now be used in constant expressions. New Compiler Flags ------------------ diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 382fb6b7a3c0..90475a361bb8 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -2714,6 +2714,7 @@ def Abs : IntMathTemplate, LibBuiltin<"stdlib.h"> { let Attributes = [NoThrow, Const]; let Prototype = "T(T)"; let AddBuiltinPrefixedAlias = 1; + let OnlyBuiltinPrefixedAliasIsConstexpr = 1; } def Calloc : LibBuiltin<"stdlib.h"> { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 65c7b4e5306d..d4a8e6c2035e 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -563,6 +563,20 @@ static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_abs(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, const Function *Func, + const CallExpr *Call) { + PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); + APSInt Val = peekToAPSInt(S.Stk, ArgT); + if (Val == + APSInt(APInt::getSignedMinValue(Val.getBitWidth()), /*IsUnsigned=*/false)) + return false; + if (Val.isNegative()) + Val.negate(); + pushInteger(S, Val, Call->getType()); + return true; +} + static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, @@ -1808,6 +1822,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case Builtin::BI__builtin_abs: + case Builtin::BI__builtin_labs: + case Builtin::BI__builtin_llabs: + if (!interp__builtin_abs(S, OpPC, Frame, F, Call)) + return false; + break; + case Builtin::BI__builtin_popcount: case Builtin::BI__builtin_popcountl: case Builtin::BI__builtin_popcountll: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 8544052d5e49..8e36cad2d2c6 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13098,6 +13098,20 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return Success(Val.popcount() % 2, E); } + case Builtin::BI__builtin_abs: + case Builtin::BI__builtin_labs: + case Builtin::BI__builtin_llabs: { + APSInt Val; + if (!EvaluateInteger(E->getArg(0), Val, Info)) + return false; + if (Val == APSInt(APInt::getSignedMinValue(Val.getBitWidth()), + /*IsUnsigned=*/false)) + return false; + if (Val.isNegative()) + Val.negate(); + return Success(Val, E); + } + case Builtin::BI__builtin_popcount: case Builtin::BI__builtin_popcountl: case Builtin::BI__builtin_popcountll: diff --git 
a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 450ff5671314..b5d334178f82 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -265,6 +265,20 @@ namespace fpclassify { char classify_subnorm [__builtin_fpclassify(-1, -1, -1, +1, -1, 1.0e-38f)]; } +namespace abs { + static_assert(__builtin_abs(14) == 14, ""); + static_assert(__builtin_labs(14L) == 14L, ""); + static_assert(__builtin_llabs(14LL) == 14LL, ""); + static_assert(__builtin_abs(-14) == 14, ""); + static_assert(__builtin_labs(-0x14L) == 0x14L, ""); + static_assert(__builtin_llabs(-0x141414141414LL) == 0x141414141414LL, ""); +#define BITSIZE(x) (sizeof(x) * 8) + constexpr int abs4 = __builtin_abs(1 << (BITSIZE(int) - 1)); // both-error {{must be initialized by a constant expression}} + constexpr long abs6 = __builtin_labs(1L << (BITSIZE(long) - 1)); // both-error {{must be initialized by a constant expression}} + constexpr long long abs8 = __builtin_llabs(1LL << (BITSIZE(long long) - 1)); // both-error {{must be initialized by a constant expression}} +#undef BITSIZE +} // namespace abs + namespace fabs { static_assert(__builtin_fabs(-14.0) == 14.0, ""); } diff --git a/clang/test/CodeGenCXX/builtins.cpp b/clang/test/CodeGenCXX/builtins.cpp index 90265186fb3d..37f9491d12d0 100644 --- a/clang/test/CodeGenCXX/builtins.cpp +++ b/clang/test/CodeGenCXX/builtins.cpp @@ -14,6 +14,12 @@ int o = X::__builtin_fabs(-2.0); long p = X::__builtin_fabsf(-3.0f); // CHECK: @p ={{.*}} global i64 3, align 8 +int x = __builtin_abs(-2); +// CHECK: @x ={{.*}} global i32 2, align 4 + +long y = __builtin_abs(-2l); +// CHECK: @y ={{.*}} global i64 2, align 8 + // PR8839 extern "C" char memmove(); @@ -52,14 +58,6 @@ extern "C" int __builtin_abs(int); // #1 long __builtin_abs(long); // #2 extern "C" int __builtin_abs(int); // #3 -int x = __builtin_abs(-2); -// CHECK: [[X:%.+]] = call i32 @llvm.abs.i32(i32 -2, i1 true) -// CHECK-NEXT: store i32 [[X]], ptr @x, align 4 - -long y = __builtin_abs(-2l); -// CHECK: [[Y:%.+]] = call noundef i64 @_Z13__builtin_absl(i64 noundef -2) -// CHECK: store i64 [[Y]], ptr @y, align 8 - extern const char char_memchr_arg[32]; char *memchr_result = __builtin_char_memchr(char_memchr_arg, 123, 32); // CHECK: call ptr @memchr(ptr noundef @char_memchr_arg, i32 noundef 123, i64 noundef 32) diff --git a/clang/test/Sema/constant-builtins-2.c b/clang/test/Sema/constant-builtins-2.c index da2264500d76..e465a3c5f0ad 100644 --- a/clang/test/Sema/constant-builtins-2.c +++ b/clang/test/Sema/constant-builtins-2.c @@ -35,7 +35,7 @@ long double g11 = __builtin_nansl(""); __float128 g11_2 = __builtin_nansf128(""); #endif -//int g12 = __builtin_abs(-12); +int g12 = __builtin_abs(-12); double g13 = __builtin_fabs(-12.); double g13_0 = __builtin_fabs(-0.); @@ -456,6 +456,17 @@ char clrsb9[__builtin_clrsb(1 << (BITSIZE(int) - 1)) == 0 ? 1 : -1]; char clrsb10[__builtin_clrsb(~(1 << (BITSIZE(int) - 1))) == 0 ? 1 : -1]; char clrsb11[__builtin_clrsb(0xf) == BITSIZE(int) - 5 ? 1 : -1]; char clrsb12[__builtin_clrsb(~0x1f) == BITSIZE(int) - 6 ? 
1 : -1];
+
+char abs1[__builtin_abs(-12)];
+char abs2[__builtin_labs(-12L)];
+char abs3[__builtin_llabs(-12LL)];
+int abs4 = __builtin_abs(1 << (BITSIZE(int) - 1)); // expected-error {{not a compile-time constant}}
+char abs5[__builtin_abs((1 << (BITSIZE(int) - 1)) + 1)];
+long abs6 = __builtin_labs(1L << (BITSIZE(long) - 1)); // expected-error {{not a compile-time constant}}
+long abs7 = __builtin_labs((1L << (BITSIZE(long) - 1)) + 1);
+long long abs8 = __builtin_llabs(1LL << (BITSIZE(long long) - 1)); // expected-error {{not a compile-time constant}}
+long long abs9 = __builtin_llabs((1LL << (BITSIZE(long long) - 1)) + 1);
+
 #undef BITSIZE

 // GCC misc stuff
--
GitLab


From 1a871b2122470491f73b51f3e57718bc3bda08f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Fri, 18 Oct 2024 12:20:17 +0100
Subject: [PATCH 044/511] [mlir][tensor] Add tests to invalid.mlir (nfc)
 (#112759)

Adds two tests with invalid usage of `tensor.extract_slice` that were
missing. Also moves one other test for `tensor.extract_slice`, so that all
tests for this Op are clustered together.

Note, this PR merely documents the current behaviour. No new functionality
is added.
---
 mlir/test/Dialect/Tensor/invalid.mlir | 35 +++++++++++++++++--------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir
index 84e6c59e403d..921d7f9f1fef 100644
--- a/mlir/test/Dialect/Tensor/invalid.mlir
+++ b/mlir/test/Dialect/Tensor/invalid.mlir
@@ -200,7 +200,6 @@ func.func @tensor.reshape_num_elements_mismatch(
 func.func @extract_slice_wrong_result_rank(%t: tensor<?xf32>, %idx : index) {
   // expected-error @+1 {{expected rank to be smaller or equal to the other rank.}}
   %0 = tensor.extract_slice %t[0][4][1] : tensor<?xf32> to tensor<?x?xf32>
-
   return
 }

@@ -209,7 +208,25 @@ func.func @extract_slice_wrong_result_rank(%t: tensor<?xf32>, %idx : index) {
   // expected-error @+1 {{expected element type to be 'f32'}}
   %0 = tensor.extract_slice %t[0][4][1] : tensor<?xf32> to tensor<4xi8>
+  return
+}
+
+
+// -----
+
+func.func @extract_slice_size_and_output_dim_mismatch_static_size(%t: tensor<16xf32>) {
+  // expected-error @+1 {{expected type to be 'tensor<4xf32>' or a rank-reduced version. (size mismatch)}}
+  %0 = tensor.extract_slice %t[0][4][1]
+    : tensor<16xf32> to tensor<6xf32>
+  return
+}
+
+// -----

+func.func @extract_slice_size_and_output_dim_mismatch_dynamic_size(%t: tensor<?xf32>, %idx : index) {
+  // expected-error @+2 {{expected type to be 'tensor<?xf32>' or a rank-reduced version. (size mismatch)}}
+  %c4 = arith.constant 4 : index
+  %0 = tensor.extract_slice %t[0][%c4][1] : tensor<?xf32> to tensor<4xi8>
   return
 }

@@ -219,7 +236,6 @@ func.func @extract_slice_wrong_static_type(%t: tensor<8x16x4xf32>, %idx : index)
   // expected-error @+1 {{expected type to be 'tensor<?x4x4xf32>' or a rank-reduced version. (size mismatch)}}
   %0 = tensor.extract_slice %t[0, 0, 0][%idx, 4, 4][1, 1, 1]
     : tensor<8x16x4xf32> to tensor<4x4x4xf32>
-
   return
 }

@@ -229,7 +245,14 @@ func.func @extract_slice_wrong_dynamic_type(%t: tensor<8x16x4xf32>, %idx : index
   // expected-error @+1 {{expected type to be 'tensor<4x4x4xf32>' or a rank-reduced version. (size mismatch)}}
   %0 = tensor.extract_slice %t[0, 2, 0][4, 4, 4][1, 1, 1]
     : tensor<8x16x4xf32> to tensor<?x4x4xf32>
+  return
+}

+// -----
+
+func.func @illegal_num_offsets(%arg0 : tensor<?x?x?xf32>, %arg1 : index, %arg2 : index) {
+  // expected-error@+1 {{expected 3 offset values}}
+  %0 = tensor.extract_slice %arg0[0, 0] [%arg1, %arg2] [1, 1] : tensor<?x?x?xf32> to tensor<?x?xf32>
   return
 }

@@ -349,14 +372,6 @@ func.func @rank(%0: f32) {

 // -----

-func.func @illegal_num_offsets(%arg0 : tensor<?x?x?xf32>, %arg1 : index, %arg2 : index) {
-  // expected-error@+1 {{expected 3 offset values}}
-  %0 = tensor.extract_slice %arg0[0, 0] [%arg1, %arg2] [1, 1] : tensor<?x?x?xf32> to tensor<?x?xf32>
-  return
-}
-
-// -----
-
 func.func @illegal_num_offsets(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?x?xf32>,
     %arg2 : index, %arg3 : index) {
   // expected-error@+1 {{expected 3 offset values}}
--
GitLab


From 4995d093555f00728e20f4e4095cd182f748cec7 Mon Sep 17 00:00:00 2001
From: Balazs Benics
Date: Fri, 18 Oct 2024 13:51:20 +0200
Subject: [PATCH 045/511] [analyzer][Solver] Improve getSymVal and friends
 (1/2) (#112583)

---
 .../Checkers/BitwiseShiftChecker.cpp          |  3 +-
 .../Core/RangeConstraintManager.cpp           | 21 +++++---------
 clang/test/Analysis/infeasible-sink.c         | 29 +++----------------
 3 files changed, 14 insertions(+), 39 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/BitwiseShiftChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BitwiseShiftChecker.cpp
index 339927c165fe..17f1214195b3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BitwiseShiftChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BitwiseShiftChecker.cpp
@@ -177,7 +177,8 @@ BugReportPtr BitwiseShiftValidator::checkOvershift() {
     RightOpStr = formatv(" '{0}'", ConcreteRight->getValue());
   else {
     SValBuilder &SVB = Ctx.getSValBuilder();
-    if (const llvm::APSInt *MinRight = SVB.getMinValue(FoldedState, Right)) {
+    if (const llvm::APSInt *MinRight = SVB.getMinValue(FoldedState, Right);
+        MinRight && *MinRight >= LHSBitWidth) {
       LowerBoundStr = formatv(" >= {0},", MinRight->getExtValue());
     }
   }
diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index 70d5a6096817..ecf7974c8386 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -1939,11 +1939,8 @@ private:
   RangeSet::Factory F;

   RangeSet getRange(ProgramStateRef State, SymbolRef Sym);
-  RangeSet getRange(ProgramStateRef State, EquivalenceClass Class);
   ProgramStateRef setRange(ProgramStateRef State, SymbolRef Sym,
                            RangeSet Range);
-  ProgramStateRef setRange(ProgramStateRef State, EquivalenceClass Class,
-                           RangeSet Range);

   RangeSet getSymLTRange(ProgramStateRef St, SymbolRef Sym,
                          const llvm::APSInt &Int,
@@ -2866,24 +2863,22 @@ ConditionTruthVal RangeConstraintManager::checkNull(ProgramStateRef State,

 const llvm::APSInt *RangeConstraintManager::getSymVal(ProgramStateRef St,
                                                       SymbolRef Sym) const {
-  const RangeSet *T = getConstraint(St, Sym);
-  return T ? T->getConcreteValue() : nullptr;
+  auto &MutableSelf = const_cast<RangeConstraintManager &>(*this);
+  return MutableSelf.getRange(St, Sym).getConcreteValue();
 }

 const llvm::APSInt *RangeConstraintManager::getSymMinVal(ProgramStateRef St,
                                                          SymbolRef Sym) const {
-  const RangeSet *T = getConstraint(St, Sym);
-  if (!T || T->isEmpty())
-    return nullptr;
-  return &T->getMinValue();
+  auto &MutableSelf = const_cast<RangeConstraintManager &>(*this);
+  RangeSet Range = MutableSelf.getRange(St, Sym);
+  return Range.isEmpty() ? 
nullptr : &Range.getMinValue(); } const llvm::APSInt *RangeConstraintManager::getSymMaxVal(ProgramStateRef St, SymbolRef Sym) const { - const RangeSet *T = getConstraint(St, Sym); - if (!T || T->isEmpty()) - return nullptr; - return &T->getMaxValue(); + auto &MutableSelf = const_cast(*this); + RangeSet Range = MutableSelf.getRange(St, Sym); + return Range.isEmpty() ? nullptr : &Range.getMaxValue(); } //===----------------------------------------------------------------------===// diff --git a/clang/test/Analysis/infeasible-sink.c b/clang/test/Analysis/infeasible-sink.c index 9cb66fcac0b6..a88ca42f27e4 100644 --- a/clang/test/Analysis/infeasible-sink.c +++ b/clang/test/Analysis/infeasible-sink.c @@ -38,7 +38,7 @@ void test1(int x) { } int a, b, c, d, e; -void test2() { +void test2(void) { if (a == 0) return; @@ -50,31 +50,10 @@ void test2() { b = d; a -= d; - if (a != 0) - return; - - clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}} + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} - /* The BASELINE passes these checks ('wrning' is used to avoid lit to match) - // The parent state is already infeasible, look at this contradiction: - clang_analyzer_eval(b > 0); // expected-wrning{{FALSE}} - clang_analyzer_eval(b <= 0); // expected-wrning{{FALSE}} - // Crashes with expensive checks. - if (b > 0) { - clang_analyzer_warnIfReached(); // no-warning, OK + if (a != 0) return; - } - // Should not be reachable. - clang_analyzer_warnIfReached(); // expected-wrning{{REACHABLE}} - */ - // The parent state is already infeasible, but we realize that only if b is - // constrained. - clang_analyzer_eval(b > 0); // expected-warning{{UNKNOWN}} - clang_analyzer_eval(b <= 0); // expected-warning{{UNKNOWN}} - if (b > 0) { - clang_analyzer_warnIfReached(); // no-warning - return; - } - clang_analyzer_warnIfReached(); // no-warning + clang_analyzer_warnIfReached(); // no-warning: Unreachable due to contradiction. 
} -- GitLab From 7be1dc0f32f43331c049725e0e2b902e74115779 Mon Sep 17 00:00:00 2001 From: Nicholas Guy <67685292+NickGuy-Arm@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:08:24 +0100 Subject: [PATCH 046/511] [PassRegistry] Add complex deinterleaving pass to PassRegistry.def (#112874) Allow for the complex deinterleaving pass to be invoked via `opt --passes=complex-deinterleaving` --- llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index ebad3507eb5e..60ab33bee704 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -80,6 +80,7 @@ #include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/CallBrPrepare.h" #include "llvm/CodeGen/CodeGenPrepare.h" +#include "llvm/CodeGen/ComplexDeinterleavingPass.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" #include "llvm/CodeGen/DwarfEHPrepare.h" #include "llvm/CodeGen/EarlyIfConversion.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 90859c18c4f4..549c1359b585 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -339,6 +339,7 @@ FUNCTION_PASS("callbr-prepare", CallBrPreparePass()) FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("chr", ControlHeightReductionPass()) FUNCTION_PASS("codegenprepare", CodeGenPreparePass(TM)) +FUNCTION_PASS("complex-deinterleaving", ComplexDeinterleavingPass(TM)) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("constraint-elimination", ConstraintEliminationPass()) FUNCTION_PASS("coro-elide", CoroElidePass()) -- GitLab From 852e4779ba39732d63df60e23cf33abc6987b8e8 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 18 Oct 2024 07:17:01 -0500 Subject: [PATCH 047/511] =?UTF-8?q?[flang][OpenMP]=20Add=20`Id`=20function?= =?UTF-8?q?=20to=20`OmpClause`=20to=20return=20clause=20id,=E2=80=A6=20(#1?= =?UTF-8?q?12712)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … NFC This replaces the two instances of `GetClauseKindForParserClass` with a localized member function. 
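For example, with the new member a semantics check can query the clause
kind directly (a minimal sketch mirroring the `switch (x.Id())` usage
introduced in check-omp-structure.cpp below):

```
#include "flang/Parser/parse-tree.h"
#include "llvm/Frontend/OpenMP/OMP.h"

// `x` is any parsed OpenMP clause; no per-class template dispatch is needed.
static bool isLastprivate(const Fortran::parser::OmpClause &x) {
  return x.Id() == llvm::omp::Clause::OMPC_lastprivate;
}
```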
--- flang/include/flang/Parser/parse-tree.h | 2 ++ flang/lib/Lower/OpenMP/Clauses.cpp | 23 +-------------------- flang/lib/Parser/parse-tree.cpp | 18 ++++++++++++++++ flang/lib/Semantics/check-omp-structure.cpp | 9 +++----- flang/lib/Semantics/check-omp-structure.h | 7 ------- 5 files changed, 24 insertions(+), 35 deletions(-) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 21b4a344dbc4..4a3c992c4ec5 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -26,6 +26,7 @@ #include "flang/Common/idioms.h" #include "flang/Common/indirection.h" #include "llvm/Frontend/OpenACC/ACC.h.inc" +#include "llvm/Frontend/OpenMP/OMP.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include #include @@ -3660,6 +3661,7 @@ struct OmpLastprivateClause { // OpenMP Clauses struct OmpClause { UNION_CLASS_BOILERPLATE(OmpClause); + llvm::omp::Clause Id() const; #define GEN_FLANG_CLAUSE_PARSER_CLASSES #include "llvm/Frontend/OpenMP/OMP.inc" diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 812551de6857..64d661256a18 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -22,26 +22,6 @@ #include #include -namespace detail { -template -llvm::omp::Clause getClauseIdForClass(C &&) { - using namespace Fortran; - using A = llvm::remove_cvref_t; // A is referenced in OMP.inc - // The code included below contains a sequence of checks like the following - // for each OpenMP clause - // if constexpr (std::is_same_v) - // return llvm::omp::Clause::OMPC_acq_rel; - // [...] -#define GEN_FLANG_CLAUSE_PARSER_KIND_MAP -#include "llvm/Frontend/OpenMP/OMP.inc" -} -} // namespace detail - -static llvm::omp::Clause getClauseId(const Fortran::parser::OmpClause &clause) { - return Fortran::common::visit( - [](auto &&s) { return detail::getClauseIdForClass(s); }, clause.u); -} - namespace Fortran::lower::omp { using SymbolWithDesignator = std::tuple; @@ -1253,8 +1233,7 @@ Clause makeClause(const parser::OmpClause &cls, semantics::SemanticsContext &semaCtx) { return Fortran::common::visit( [&](auto &&s) { - return makeClause(getClauseId(cls), clause::make(s, semaCtx), - cls.source); + return makeClause(cls.Id(), clause::make(s, semaCtx), cls.source); }, cls.u); } diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp index 7f0899aaa142..948ad04a091a 100644 --- a/flang/lib/Parser/parse-tree.cpp +++ b/flang/lib/Parser/parse-tree.cpp @@ -253,3 +253,21 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Name &x) { return os << x.ToString(); } } // namespace Fortran::parser + +template static llvm::omp::Clause getClauseIdForClass(C &&) { + using namespace Fortran; + using A = llvm::remove_cvref_t; // A is referenced in OMP.inc + // The code included below contains a sequence of checks like the following + // for each OpenMP clause + // if constexpr (std::is_same_v) + // return llvm::omp::Clause::OMPC_acq_rel; + // [...] 
+#define GEN_FLANG_CLAUSE_PARSER_KIND_MAP +#include "llvm/Frontend/OpenMP/OMP.inc" +} + +namespace Fortran::parser { +llvm::omp::Clause OmpClause::Id() const { + return std::visit([](auto &&s) { return getClauseIdForClass(s); }, u); +} +} // namespace Fortran::parser diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 473ed2be3dbc..461a99f59e4c 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2363,11 +2363,8 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) { void OmpStructureChecker::Enter(const parser::OmpClause &x) { SetContextClause(x); - llvm::omp::Clause clauseId = std::visit( - [this](auto &&s) { return GetClauseKindForParserClass(s); }, x.u); - // The visitors for these clauses do their own checks. - switch (clauseId) { + switch (x.Id()) { case llvm::omp::Clause::OMPC_copyprivate: case llvm::omp::Clause::OMPC_enter: case llvm::omp::Clause::OMPC_lastprivate: @@ -3244,7 +3241,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Lastprivate &x) { DirectivesClauseTriple dirClauseTriple; SymbolSourceMap currSymbols; GetSymbolsInObjectList(objectList, currSymbols); - CheckDefinableObjects(currSymbols, GetClauseKindForParserClass(x)); + CheckDefinableObjects(currSymbols, llvm::omp::Clause::OMPC_lastprivate); CheckCopyingPolymorphicAllocatable( currSymbols, llvm::omp::Clause::OMPC_lastprivate); @@ -3257,7 +3254,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Lastprivate &x) { llvm::omp::Directive::OMPD_parallel, llvm::omp::privateReductionSet)); CheckPrivateSymbolsInOuterCxt( - currSymbols, dirClauseTriple, GetClauseKindForParserClass(x)); + currSymbols, dirClauseTriple, llvm::omp::Clause::OMPC_lastprivate); using LastprivateModifier = parser::OmpLastprivateClause::LastprivateModifier; const auto &maybeMod{std::get>(x.v.t)}; diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index cce9fa4e3016..70a7779ae1fa 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -132,13 +132,6 @@ public: #define GEN_FLANG_CLAUSE_CHECK_ENTER #include "llvm/Frontend/OpenMP/OMP.inc" - // Get the OpenMP Clause Kind for the corresponding Parser class - template - llvm::omp::Clause GetClauseKindForParserClass(const A &) { -#define GEN_FLANG_CLAUSE_PARSER_KIND_MAP -#include "llvm/Frontend/OpenMP/OMP.inc" - } - private: bool CheckAllowedClause(llvmOmpClause clause); bool IsVariableListItem(const Symbol &sym); -- GitLab From fdd7c0353fa2d48239e5ac0c1cafb7f31fca4206 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Fri, 18 Oct 2024 14:22:45 +0200 Subject: [PATCH 048/511] [libc][math][c23] Add tanhf16 C23 math function (#106006) Part of #95250. 
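The implementation evaluates tanh through exp, as documented in the
comments of the new source file. As a rough reference model only (the real
code works on float16 with a 2^(-mid) lookup table and minimax
polynomials), the identity it relies on is:

```
#include <cmath>

// tanh(x) = (e^(2x) - 1) / (e^(2x) + 1); the patch computes e^(2x) as
// 2^(hi + mid) * e^(2*lo) after splitting 2x against log(2).
static double tanh_reference(double x) {
  double e2x = std::exp(2.0 * x);
  return (e2x - 1.0) / (e2x + 1.0);
}
```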
--- libc/config/gpu/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 23 ++++ libc/src/math/generic/tanhf16.cpp | 144 ++++++++++++++++++++++ libc/src/math/tanhf16.h | 21 ++++ libc/test/src/math/CMakeLists.txt | 11 ++ libc/test/src/math/smoke/CMakeLists.txt | 13 ++ libc/test/src/math/smoke/tanhf16_test.cpp | 143 +++++++++++++++++++++ libc/test/src/math/tanhf16_test.cpp | 40 ++++++ 12 files changed, 400 insertions(+), 1 deletion(-) create mode 100644 libc/src/math/generic/tanhf16.cpp create mode 100644 libc/src/math/tanhf16.h create mode 100644 libc/test/src/math/smoke/tanhf16_test.cpp create mode 100644 libc/test/src/math/tanhf16_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 4bb81f5d3b2d..d89093b2117c 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -587,6 +587,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 libc.src.math.sinhf16 + libc.src.math.tanhf16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 39f451d6b5fc..7314dbc660f3 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -681,6 +681,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.setpayloadsigf16 libc.src.math.sinhf16 libc.src.math.sinpif16 + libc.src.math.tanhf16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 902645c9e001..010377a90f6e 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -348,7 +348,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | tan | |check| | |check| | | | | 7.12.4.7 | F.10.1.7 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| tanh | |check| | | | | | 7.12.5.6 | F.10.2.6 | +| tanh | |check| | | | |check| | | 7.12.5.6 | F.10.2.6 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | tanpi | | | | | | 7.12.4.14 | F.10.1.14 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index e4e46e7e13a5..196dab9f81b3 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -798,6 +798,7 @@ def StdC : StandardSpec<"stdc"> { GuardedFunctionSpec<"sinhf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, FunctionSpec<"tanhf", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"tanhf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, FunctionSpec<"acosf", RetValSpec, [ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 2f76b57d19e9..8427b550ab4c 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -496,6 +496,7 @@ add_math_entrypoint_object(tanf) add_math_entrypoint_object(tanh) 
add_math_entrypoint_object(tanhf) +add_math_entrypoint_object(tanhf16) add_math_entrypoint_object(tgamma) add_math_entrypoint_object(tgammaf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 4a3de8f0400d..81b3e44db792 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4288,6 +4288,29 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + tanhf16 + SRCS + tanhf16.cpp + HDRS + ../tanhf16.h + DEPENDS + .expxf16 + libc.hdr.fenv_macros + libc.src.__support.CPP.array + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( acoshf SRCS diff --git a/libc/src/math/generic/tanhf16.cpp b/libc/src/math/generic/tanhf16.cpp new file mode 100644 index 000000000000..ae9b4be46f7c --- /dev/null +++ b/libc/src/math/generic/tanhf16.cpp @@ -0,0 +1,144 @@ +//===-- Half-precision tanh(x) function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/tanhf16.h" +#include "expxf16.h" +#include "hdr/fenv_macros.h" +#include "src/__support/CPP/array.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +static constexpr fputil::ExceptValues TANHF16_EXCEPTS = {{ + // x = 0x1.f54p+0, tanhf16(x) = 0x1.ecp-1 (RZ) + {0x3fd5U, 0x3bb0U, 1U, 0U, 0U}, + // x = -0x1.f54p+0, tanhf16(x) = -0x1.ecp-1 (RZ) + {0xbfd5U, 0xbbb0U, 0U, 1U, 0U}, +}}; + +LLVM_LIBC_FUNCTION(float16, tanhf16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When -2^(-14) <= x <= -2^(-9), or |x| <= 0x1.d2p-4, + // or |x| >= atanh(1 - 2^(-11)), or x is NaN. + if (LIBC_UNLIKELY(x_abs <= 0x2f48U || x_abs >= 0x4429U)) { + // tanh(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When -2^(-14) <= x <= -2^(-9). + if (x_u >= 0x8400U && x_u <= 0x9800U) { + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_DOWNWARD: + return x; + default: + return FPBits(static_cast(x_u - 1U)).get_val(); + } + } + + // When |x| <= 0x1.d2p-4. 
+ if (x_abs <= 0x2f48U) { + float xf = x; + float xf_sq = xf * xf; + // Degree-7 Taylor expansion generated by Sollya with the following + // commands: + // > taylor(tanh(x), 7, 0); + // > display = hexadecimal; + // > // For each coefficient: + // > round(/* put coefficient here */, SG, RN); + return fputil::cast( + xf * fputil::polyeval(xf_sq, 0x1p+0f, -0x1.555556p-2f, 0x1.111112p-3f, + -0x1.ba1ba2p-5f)); + } + + // tanh(+/-inf) = +/-1 + if (x_bits.is_inf()) + return FPBits::one(x_bits.sign()).get_val(); + + // When |x| >= atanh(1 - 2^(-11)). + fputil::raise_except_if_required(FE_INEXACT); + + int rounding_mode = fputil::quick_get_round(); + if ((rounding_mode == FE_TONEAREST && x_abs >= 0x4482U) || + (rounding_mode == FE_UPWARD && x_bits.is_pos()) || + (rounding_mode == FE_DOWNWARD && x_bits.is_neg())) { + return FPBits::one(x_bits.sign()).get_val(); + } + if (x_bits.is_pos()) + return fputil::cast(0x1.ffcp-1); + return fputil::cast(-0x1.ffcp-1); + } + + if (auto r = TANHF16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + + // For atanh(-1 + 2^(-11)) < x < atanh(1 - 2^(-11)), to compute tanh(x), we + // perform the following range reduction: find hi, mid, lo, such that: + // x = (hi + mid) * log(2) * 0.5 + lo, in which + // hi is an integer, + // mid * 2^5 is an integer, + // -2^(-5) <= lo < 2^(-5). + // In particular, + // hi + mid = round(x * log2(e) * 2 * 2^5) * 2^(-5). + // Then, + // tanh(x) = sinh(x)/cosh(x) + // = (e^x - e^(-x)) / (e^x + e^(-x)) + // = (e^(2x) - 1) / (e^(2x) + 1) + // = (2^(hi + mid) * e^(2*lo) - 1) / (2^(hi + mid) * e^(2*lo) + 1) + // = (e^(2*lo) - 2^(-hi - mid)) / (e^(2*lo) + 2^(-hi - mid)) + // We store 2^(-mid) in the lookup table EXP2_MID_5_BITS, and compute + // 2^(-hi - mid) by adding -hi to the exponent field of 2^(-mid). + // e^lo is computed using a degree-3 minimax polynomial generated by Sollya. + + float xf = x; + float kf = fputil::nearest_integer(xf * (LOG2F_E * 2.0f * 0x1.0p+5f)); + int x_hi_mid = -static_cast(kf); + unsigned x_hi = static_cast(x_hi_mid) >> 5; + unsigned x_mid = static_cast(x_hi_mid) & 0x1f; + // lo = x - (hi + mid) + // = round(x * log2(e) * 2 * 2^5) * log(2) * 0.5 * (-2^(-5)) + x + float lo = fputil::multiply_add(kf, LOGF_2 * 0.5f * -0x1.0p-5f, xf); + + uint32_t exp2_hi_mid_bits = + EXP2_MID_5_BITS[x_mid] + + static_cast(x_hi << fputil::FPBits::FRACTION_LEN); + // exp2_hi_mid = 2^(-hi - mid) + float exp2_hi_mid = fputil::FPBits(exp2_hi_mid_bits).get_val(); + // Degree-3 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax(expm1(2*x)/x, 2, [|SG...|], [-2^-5, 2^-5]); + // > 1 + x * P; + float exp_2lo = + fputil::polyeval(lo, 0x1p+0f, 0x1p+1f, 0x1.001p+1f, 0x1.555ddep+0f); + return fputil::cast((exp_2lo - exp2_hi_mid) / + (exp_2lo + exp2_hi_mid)); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/tanhf16.h b/libc/src/math/tanhf16.h new file mode 100644 index 000000000000..67498708fc46 --- /dev/null +++ b/libc/src/math/tanhf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for tanhf16 -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_TANHF16_H +#define LLVM_LIBC_SRC_MATH_TANHF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 tanhf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_TANHF16_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 381a3f478f37..11342e6dfa04 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1966,6 +1966,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + tanhf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + tanhf16_test.cpp + DEPENDS + libc.src.math.tanhf16 +) + add_fp_unittest( atanhf_test NEED_MPFR diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index f713430ee27c..899c9d2df453 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3778,6 +3778,19 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + tanhf16_test + SUITE + libc-math-smoke-tests + SRCS + tanhf16_test.cpp + DEPENDS + libc.hdr.fenv_macros + libc.src.errno.errno + libc.src.math.tanhf16 + libc.src.__support.FPUtil.cast +) + add_fp_unittest( atanhf_test SUITE diff --git a/libc/test/src/math/smoke/tanhf16_test.cpp b/libc/test/src/math/smoke/tanhf16_test.cpp new file mode 100644 index 000000000000..fa6328e9ef0a --- /dev/null +++ b/libc/test/src/math/smoke/tanhf16_test.cpp @@ -0,0 +1,143 @@ +//===-- Unittests for tanhf16 ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" +#include "src/errno/libc_errno.h" +#include "src/math/tanhf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcTanhf16Test = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::tanhf16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanhf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast(1.0), + LIBC_NAMESPACE::tanhf16(inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast(-1.0), + LIBC_NAMESPACE::tanhf16(neg_inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::tanhf16(zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::tanhf16(neg_zero)); + EXPECT_MATH_ERRNO(0); +} + +TEST_F(LlvmLibcTanhf16Test, ResultNearBounds) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(1.0), + LIBC_NAMESPACE::tanhf16(max_normal), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(-1.0), + LIBC_NAMESPACE::tanhf16(neg_max_normal), + FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + // round(atanh(1 - 2^-11), HP, RU); + float16 x = LIBC_NAMESPACE::fputil::cast(0x1.0a4p+2); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( + LIBC_NAMESPACE::fputil::cast(0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + LIBC_NAMESPACE::fputil::cast(1.0), LIBC_NAMESPACE::tanhf16(x), + FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + LIBC_NAMESPACE::fputil::cast(0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + LIBC_NAMESPACE::fputil::cast(0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + x = LIBC_NAMESPACE::fputil::cast(0x1.208p+2); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( + LIBC_NAMESPACE::fputil::cast(1.0), LIBC_NAMESPACE::tanhf16(x), + FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + LIBC_NAMESPACE::fputil::cast(1.0), LIBC_NAMESPACE::tanhf16(x), + FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + LIBC_NAMESPACE::fputil::cast(0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + LIBC_NAMESPACE::fputil::cast(0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + // round(atanh(-1 + 2^-11), HP, RD); + x = LIBC_NAMESPACE::fputil::cast(-0x1.0a4p+2); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::tanhf16(x), + FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + 
EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + x = LIBC_NAMESPACE::fputil::cast(-0x1.208p+2); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( + LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::tanhf16(x), + FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::tanhf16(x), + FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::tanhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); +} diff --git a/libc/test/src/math/tanhf16_test.cpp b/libc/test/src/math/tanhf16_test.cpp new file mode 100644 index 000000000000..7124a83f3d7b --- /dev/null +++ b/libc/test/src/math/tanhf16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for tanhf16 ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/tanhf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcTanhf16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf]; +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0]; +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcTanhf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x, + LIBC_NAMESPACE::tanhf16(x), 0.5); + } +} + +TEST_F(LlvmLibcTanhf16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x, + LIBC_NAMESPACE::tanhf16(x), 0.5); + } +} -- GitLab From 3a30955cb7166c23f041877ed56a60fb0aed80cd Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 18 Oct 2024 08:32:47 -0400 Subject: [PATCH 049/511] [libc++] Remove obsolete _LIBCPP_PREDEFINED_OBJC_ARC_ADDRESSOF (#112745) I don't know if that macro was ever truly defined by Clang, however it's not anymore, so that is effectively dead code. --- libcxx/include/__memory/addressof.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libcxx/include/__memory/addressof.h b/libcxx/include/__memory/addressof.h index ecb68e0fe61e..98b08958a6a9 100644 --- a/libcxx/include/__memory/addressof.h +++ b/libcxx/include/__memory/addressof.h @@ -23,11 +23,9 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_NO_CFI _LIBCPP_HIDE_FROM_ABI _Tp* a return __builtin_addressof(__x); } -#if _LIBCPP_HAS_OBJC_ARC && !defined(_LIBCPP_PREDEFINED_OBJC_ARC_ADDRESSOF) +#if _LIBCPP_HAS_OBJC_ARC // Objective-C++ Automatic Reference Counting uses qualified pointers -// that require special addressof() signatures. 
When -// _LIBCPP_PREDEFINED_OBJC_ARC_ADDRESSOF is defined, the compiler -// itself is providing these definitions. Otherwise, we provide them. +// that require special addressof() signatures. template inline _LIBCPP_HIDE_FROM_ABI __strong _Tp* addressof(__strong _Tp& __x) _NOEXCEPT { return &__x; -- GitLab From ce4618a9c405bd8a9c1e096eb45e9ca83d3891f1 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Fri, 18 Oct 2024 21:42:03 +0900 Subject: [PATCH 050/511] [ASan][windows] Recognize movzx r11d, BYTE PTR [rdx] in interception_win (#111638) The instruction is present in some library in the 24H2 update for Windows 11: ==8508==interception_win: unhandled instruction at 0x7ff83e193a40: 44 0f b6 1a 4c 8b d2 48 This could be generalized, but getting all the ModR/M byte combinations right is tricky. Many other classes of instructions handled in this file could use some generalization too. --- compiler-rt/lib/interception/interception_win.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp index 4a6ff6656edb..4f60d4251303 100644 --- a/compiler-rt/lib/interception/interception_win.cpp +++ b/compiler-rt/lib/interception/interception_win.cpp @@ -768,6 +768,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { } switch (*(u32*)(address)) { + case 0x1ab60f44: // 44 0f b6 1a : movzx r11d, BYTE PTR [rdx] case 0x24448b48: // 48 8b 44 24 XX : mov rax, QWORD ptr [rsp + XX] case 0x246c8948: // 48 89 6C 24 XX : mov QWORD ptr [rsp + XX], rbp case 0x245c8948: // 48 89 5c 24 XX : mov QWORD PTR [rsp + XX], rbx -- GitLab From d7ae43e41196b59a519aef5c80f5236172a0ede3 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Fri, 18 Oct 2024 09:00:32 -0400 Subject: [PATCH 051/511] [NFC][X86] Fix Werror=extra error due to enum (#112812) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is one of the many PRs to fix errors with LLVM_ENABLE_WERROR=on. Built by GCC 11. Fix warnings: llvm-project/llvm/lib/Target/X86/X86FastISel.cpp: In member function ‘virtual bool {anonymous}::X86FastISel::fastLowerCall(llvm::FastISel::CallLoweringInfo&)’: llvm-project/llvm/lib/Target/X86/X86FastISel.cpp:3547: error: enumerated and non-enumerated type in conditional expression [-Werror=extra] 3547 | MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0); --- llvm/lib/Target/X86/X86FastISel.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 4bf660b5e234..9e4e5547c642 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3544,7 +3544,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc)); if (NeedLoad) - MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0); + MIB.addReg(Is64Bit ? 
X86::RIP : X86::NoRegister).addImm(1).addReg(0); if (Symbol) MIB.addSym(Symbol, OpFlags); else diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ff66eb15508c..9d143256de1e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29986,7 +29986,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts); AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT); // Perform the actual shift. - unsigned LogicalOpc = Opc == ISD::SRA ? ISD::SRL : Opc; + unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc; SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide); // Now we need to construct a mask which will "drop" bits that get // shifted past the LSB/MSB. For a logical shift left, it will look -- GitLab From c742a5dc2e67e1f0020a1fd9f602b369b740eafc Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Fri, 18 Oct 2024 09:01:01 -0400 Subject: [PATCH 052/511] [NFC] Fix multi-character character constant warning (#112809) This is one of the many PRs to fix errors with LLVM_ENABLE_WERROR=on. Built by GCC 11. Fix warning: In file included from llvm-project/llvm/lib/BinaryFormat/Minidump.cpp:9: llvm-project/llvm/include/llvm/BinaryFormat/Minidump.h:250:37: error: multi-character character constant [-Werror=multichar] 250 | static const uint32_t LLDB_FLAG = 'LLDB'; --- llvm/include/llvm/BinaryFormat/Minidump.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/BinaryFormat/Minidump.h b/llvm/include/llvm/BinaryFormat/Minidump.h index addff4298235..03497d4c5fa6 100644 --- a/llvm/include/llvm/BinaryFormat/Minidump.h +++ b/llvm/include/llvm/BinaryFormat/Minidump.h @@ -247,7 +247,7 @@ static_assert(sizeof(Thread) == 48); struct Exception { static constexpr size_t MaxParameters = 15; static constexpr size_t MaxParameterBytes = MaxParameters * sizeof(uint64_t); - static const uint32_t LLDB_FLAG = 'LLDB'; + static const uint32_t LLDB_FLAG = 0x4C4C4442; // ASCII for 'LLDB' support::ulittle32_t ExceptionCode; support::ulittle32_t ExceptionFlags; -- GitLab From 12bcea3292a1559ecad549b5d34c8abcf19f2626 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Fri, 18 Oct 2024 20:16:56 +0700 Subject: [PATCH 053/511] [RISCV][TTI] Recognize CONCAT_VECTORS if a shufflevector mask is multiple insert subvector. 
(#111459)

reference: https://github.com/llvm/llvm-project/pull/110457
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 45 +++++++++++++++++++
 .../RISCV/fixed-vector-insert-subvector.ll    | 18 ++++++++
 .../RISCV/remarks-insert-into-small-vector.ll |  2 +-
 .../RISCV/revec-getGatherCost.ll              |  4 +-
 4 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/RISCV/fixed-vector-insert-subvector.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index df5c6b522e67..395baa5f1aab 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -343,6 +343,28 @@ RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
                                   /*AddressSpace=*/0, CostKind);
 }

+static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
+  unsigned Size = Mask.size();
+  if (!isPowerOf2_32(Size))
+    return false;
+  for (unsigned I = 0; I != Size; ++I) {
+    if (static_cast<unsigned>(Mask[I]) == I)
+      continue;
+    if (Mask[I] != 0)
+      return false;
+    if (Size % I != 0)
+      return false;
+    for (unsigned J = I + 1; J != Size; ++J)
+      // Check the pattern is repeated.
+      if (static_cast<unsigned>(Mask[J]) != J % I)
+        return false;
+    SubVectorSize = I;
+    return true;
+  }
+  // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
+  return false;
+}
+
 static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                         LLVMContext &C) {
   assert((DataVT.getScalarSizeInBits() != 8 ||
@@ -394,6 +416,29 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                              LT.second, CostKind);
       }
     }
+    int SubVectorSize;
+    if (LT.second.getScalarSizeInBits() != 1 &&
+        isRepeatedConcatMask(Mask, SubVectorSize)) {
+      InstructionCost Cost = 0;
+      unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
+      // The cost of extraction from a subvector is 0 if the index is 0.
+      for (unsigned I = 0; I != NumSlides; ++I) {
+        unsigned InsertIndex = SubVectorSize * (1 << I);
+        FixedVectorType *SubTp =
+            FixedVectorType::get(Tp->getElementType(), InsertIndex);
+        FixedVectorType *DestTp =
+            FixedVectorType::getDoubleElementsVectorType(SubTp);
+        std::pair<InstructionCost, MVT> DestLT =
+            getTypeLegalizationCost(DestTp);
+        // Add the cost of whole vector register move because the
+        // destination vector register group for vslideup cannot overlap the
+        // source.
+        Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
+        Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, {},
+                               CostKind, InsertIndex, SubTp);
+      }
+      return Cost;
+    }
   }
   // vrgather + cost of generating the mask constant.
   // We model this for an unknown mask with a single vrgather.
diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-insert-subvector.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-insert-subvector.ll
new file mode 100644
index 000000000000..47a2af92aee9
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-insert-subvector.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: 'test'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %0 = shufflevector <8 x float> poison, <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %1 = shufflevector <4 x i16> poison, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = shufflevector <4 x float> poison, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = shufflevector <2 x i1> poison, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %0 = shufflevector <8 x float> poison, <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = shufflevector <4 x i16> poison, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = shufflevector <4 x float> poison, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %3 = shufflevector <2 x i1> poison, <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
index 09612444afd2..4788e1ef7155 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT: Function: test
 ; YAML-NEXT: Args:
 ; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '0'
+; YAML-NEXT: - Cost: '-2'
 ; YAML-NEXT: - String: ' and with tree size '
 ; YAML-NEXT: - TreeSize: '9'

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
index 995cd7cfbc88..a0cb52a853b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
@@ -8,7 +8,7 @@
 ; YAML: Function: test1
 ; YAML: Args:
 ; YAML: - String: 'Stores SLP vectorized with cost '
-; YAML: - Cost: '6'
+; YAML: - Cost: '4'
 ; YAML: - String: ' and with tree size '
 ; YAML: - TreeSize: '5'

@@ -47,7 +47,7 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
 ; YAML: Function: test2
 ; YAML: Args:
 ; YAML: - String: 'Stores SLP vectorized with cost '
-; YAML: - Cost: '16'
+; YAML: - Cost: '12'
 ; YAML: - String: ' and with tree size '
 ; YAML: - TreeSize: '5'
--
GitLab


From c89d731c5dc48e34ec4d081fce7e0c94e212b2f0 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng
Date: Fri, 18 Oct 2024 21:19:02 +0800
Subject: [PATCH 054/511] [LVI] Infer non-zero from equality icmp (#112838)

The following pattern is common in loop headers:
```
  %101 = sub nuw i64 %78, %98
  %103 = icmp eq i64 %78, %98
  br i1 %103, label %.thread.i.i, label %.preheader.preheader.i.i

.preheader.preheader.i.i:
  %invariant.umin.i.i = call i64 @llvm.umin.i64(i64 %101, i64 9)
  %umax.i = call i64 @llvm.umax.i64(i64 %invariant.umin.i.i, i64 1)
  br label %.preheader.i.i

.preheader.i.i:
  ...
%116 = add nuw nsw i64 %.011.i.i, 1 %exitcond.not.i = icmp eq i64 %116, %umax.i br i1 %exitcond.not.i, label %.critedge.i.i, label %.preheader.i.i ``` As `%78` is not equal to `%98` in BB `.preheader.preheader.i.i`, we can prove `%101` is non-zero. Then we can simplify the loop exit condition. Addresses regression introduced by https://github.com/llvm/llvm-project/pull/112742. --- llvm/lib/Analysis/LazyValueInfo.cpp | 14 ++ .../CorrelatedValuePropagation/umax.ll | 168 ++++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 llvm/test/Transforms/CorrelatedValuePropagation/umax.ll diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 10ad4708596c..42b04046ce10 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1188,6 +1188,20 @@ std::optional LazyValueInfoImpl::getValueFromICmpCondition( return ValueLatticeElement::getRange(*CR); } + // a - b or ptrtoint(a) - ptrtoint(b) ==/!= 0 if a ==/!= b + Value *X, *Y; + if (ICI->isEquality() && match(Val, m_Sub(m_Value(X), m_Value(Y)))) { + // Peek through ptrtoints + match(X, m_PtrToIntSameSize(DL, m_Value(X))); + match(Y, m_PtrToIntSameSize(DL, m_Value(Y))); + if ((X == LHS && Y == RHS) || (X == RHS && Y == LHS)) { + Constant *NullVal = Constant::getNullValue(Val->getType()); + if (EdgePred == ICmpInst::ICMP_EQ) + return ValueLatticeElement::get(NullVal); + return ValueLatticeElement::getNot(NullVal); + } + } + return ValueLatticeElement::getOverdefined(); } diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/umax.ll b/llvm/test/Transforms/CorrelatedValuePropagation/umax.ll new file mode 100644 index 000000000000..4fca708c1838 --- /dev/null +++ b/llvm/test/Transforms/CorrelatedValuePropagation/umax.ll @@ -0,0 +1,168 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=correlated-propagation -S | FileCheck %s + +target datalayout = "p:32:32" + +define i32 @infer_range_from_dom_equality(i32 %x, i32 %y) { +; CHECK-LABEL: define range(i32 1, 0) i32 @infer_range_from_dom_equality( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[X]], [[Y]] +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: ret i32 1 +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: ret i32 [[SUB]] +; +entry: + %cond = icmp eq i32 %x, %y + %sub = sub i32 %x, %y + br i1 %cond, label %if.then, label %if.else + +if.then: + %max1 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max1 + +if.else: + %max2 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max2 +} + +define i32 @infer_range_from_dom_equality_commuted1(i32 %x, i32 %y) { +; CHECK-LABEL: define range(i32 1, 0) i32 @infer_range_from_dom_equality_commuted1( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[Y]], [[X]] +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: ret i32 1 +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: ret i32 [[SUB]] +; +entry: + %cond = icmp eq i32 %x, %y + %sub = sub i32 %y, %x + br i1 %cond, label %if.then, label %if.else + +if.then: + %max1 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max1 + +if.else: + %max2 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max2 +} + 
+define i32 @infer_range_from_dom_equality_commuted2(i32 %x, i32 %y) { +; CHECK-LABEL: define range(i32 1, 0) i32 @infer_range_from_dom_equality_commuted2( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[Y]], [[X]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[X]], [[Y]] +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: ret i32 1 +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: ret i32 [[SUB]] +; +entry: + %cond = icmp eq i32 %y, %x + %sub = sub i32 %x, %y + br i1 %cond, label %if.then, label %if.else + +if.then: + %max1 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max1 + +if.else: + %max2 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max2 +} + +define i32 @infer_range_from_dom_equality_ptrdiff(ptr %x, ptr %y) { +; CHECK-LABEL: define range(i32 1, 0) i32 @infer_range_from_dom_equality_ptrdiff( +; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) { +; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[X]], [[Y]] +; CHECK-NEXT: [[XI:%.*]] = ptrtoint ptr [[X]] to i32 +; CHECK-NEXT: [[YI:%.*]] = ptrtoint ptr [[Y]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[XI]], [[YI]] +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: ret i32 1 +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: ret i32 [[SUB]] +; + %cond = icmp eq ptr %x, %y + %xi = ptrtoint ptr %x to i32 + %yi = ptrtoint ptr %y to i32 + %sub = sub i32 %xi, %yi + br i1 %cond, label %if.then, label %if.else + +if.then: + %max1 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max1 + +if.else: + %max2 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max2 +} + +; Negative tests + +define i32 @infer_range_from_dom_slt(i32 %x, i32 %y) { +; CHECK-LABEL: define range(i32 1, 0) i32 @infer_range_from_dom_slt( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[X]], [[Y]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[X]], [[Y]] +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[MAX1:%.*]] = call i32 @llvm.umax.i32(i32 [[SUB]], i32 1) +; CHECK-NEXT: ret i32 [[MAX1]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[MAX2:%.*]] = call i32 @llvm.umax.i32(i32 [[SUB]], i32 1) +; CHECK-NEXT: ret i32 [[MAX2]] +; +entry: + %cond = icmp slt i32 %x, %y + %sub = sub i32 %x, %y + br i1 %cond, label %if.then, label %if.else + +if.then: + %max1 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max1 + +if.else: + %max2 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max2 +} + +define i32 @infer_range_from_dom_equality_not_match(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: define range(i32 1, 0) i32 @infer_range_from_dom_equality_not_match( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[X]], [[Z]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[X]], [[Y]] +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[MAX1:%.*]] = call i32 @llvm.umax.i32(i32 [[SUB]], i32 1) +; CHECK-NEXT: ret i32 [[MAX1]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[MAX2:%.*]] = call i32 @llvm.umax.i32(i32 [[SUB]], i32 1) +; CHECK-NEXT: ret i32 [[MAX2]] +; +entry: + %cond = icmp eq i32 %x, %z + %sub = sub i32 %x, %y + br i1 %cond, label %if.then, label %if.else + +if.then: + %max1 = call i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max1 + +if.else: + %max2 = call 
i32 @llvm.umax.i32(i32 %sub, i32 1) + ret i32 %max2 +} -- GitLab From 3bc765dbbf9bf0eceab1c9679b9f761b3f760d56 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 18 Oct 2024 14:19:42 +0100 Subject: [PATCH 055/511] [lldb][test] Add test for ASTImporter's name conflict resolution (#112566) This is a reduced test case from a crash we've observed in the past. The assertion that this test triggers is: ``` Assertion failed: ((Pos == ImportedDecls.end() || Pos->second == To) && "Try to import an already imported Decl"), function MapImported, file ASTImporter.cpp, line 10494. ``` In a non-asserts build we crash later on in the ASTImporter. The root cause is, as the assertion above points out, that we erroneously replace an existing `From->To` decl mapping with a `To` decl that isn't complete. Then we try to complete it but it has no definition and we dereference a nullptr. The reason this happens is basically what's been described in https://reviews.llvm.org/D67803?id=220956#1676588 The dylib contains a definition of `Service` which is different to the one in the main executable. When we start dumping the children of the variable we're printing, we start completing it's members, `ASTImport`ing fields in the process. When the ASTImporter realizes there's been a name conflict (i.e., a structural mismatch on the `Service` type) it would usually report back an error. However, LLDB uses `ODRHandlingType::Liberal`, which means we create a new decl for the ODR'd type instead of re-using the previously mapped decl. Eventually this leads us to crash. Ideally we'd be using `ODRHandlingType::Conservative` and warn/error, though LLDB relies on this in some cases (particularly for distinguishing template specializations, though maybe there's better a way to deal with those). We should really warn the user when this happens and not crash. To avoid the crash we'd need to know to not create a decl for the ODR violation, and instead re-use the definition we've previously seen. Though I'm not yet sure that's viable for all of LLDB's use-cases (where ODR violations might legimiately occur in a program, e.g., with opaque definitions, etc.). 
--- .../lang/cpp/odr-handling-with-dylib/Makefile | 6 ++++ .../TestOdrHandlingWithDylib.py | 29 +++++++++++++++++++ .../lang/cpp/odr-handling-with-dylib/main.cpp | 11 +++++++ .../cpp/odr-handling-with-dylib/plugin.cpp | 14 +++++++++ .../lang/cpp/odr-handling-with-dylib/plugin.h | 9 ++++++ .../cpp/odr-handling-with-dylib/service.cpp | 15 ++++++++++ .../cpp/odr-handling-with-dylib/service.h | 20 +++++++++++++ 7 files changed, 104 insertions(+) create mode 100644 lldb/test/API/lang/cpp/odr-handling-with-dylib/Makefile create mode 100644 lldb/test/API/lang/cpp/odr-handling-with-dylib/TestOdrHandlingWithDylib.py create mode 100644 lldb/test/API/lang/cpp/odr-handling-with-dylib/main.cpp create mode 100644 lldb/test/API/lang/cpp/odr-handling-with-dylib/plugin.cpp create mode 100644 lldb/test/API/lang/cpp/odr-handling-with-dylib/plugin.h create mode 100644 lldb/test/API/lang/cpp/odr-handling-with-dylib/service.cpp create mode 100644 lldb/test/API/lang/cpp/odr-handling-with-dylib/service.h diff --git a/lldb/test/API/lang/cpp/odr-handling-with-dylib/Makefile b/lldb/test/API/lang/cpp/odr-handling-with-dylib/Makefile new file mode 100644 index 000000000000..91eadaa37282 --- /dev/null +++ b/lldb/test/API/lang/cpp/odr-handling-with-dylib/Makefile @@ -0,0 +1,6 @@ +CXX_SOURCES := main.cpp service.cpp + +DYLIB_CXX_SOURCES := plugin.cpp +DYLIB_NAME := plugin + +include Makefile.rules diff --git a/lldb/test/API/lang/cpp/odr-handling-with-dylib/TestOdrHandlingWithDylib.py b/lldb/test/API/lang/cpp/odr-handling-with-dylib/TestOdrHandlingWithDylib.py new file mode 100644 index 000000000000..f67d933f6ae5 --- /dev/null +++ b/lldb/test/API/lang/cpp/odr-handling-with-dylib/TestOdrHandlingWithDylib.py @@ -0,0 +1,29 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class OdrHandlingWithDylibTestCase(TestBase): + @skipIf( + bugnumber="https://github.com/llvm/llvm-project/issues/50375, rdar://135551810" + ) + def test(self): + """ + Tests that the expression evaluator is able to deal with types + whose definitions conflict across multiple LLDB modules (in this + case the definition for 'class Service' in the main executable + has an additional field compared to the definition found in the + dylib). This causes the ASTImporter to detect a name conflict + while importing 'Service'. With LLDB's liberal ODRHandlingType + the ASTImporter happily creates a conflicting AST node for + 'Service' in the scratch ASTContext, leading to a crash down + the line. 
+ """ + self.build() + + lldbutil.run_to_source_breakpoint( + self, "plugin_entry", lldb.SBFileSpec("plugin.cpp") + ) + + self.expect_expr("*gProxyThis") diff --git a/lldb/test/API/lang/cpp/odr-handling-with-dylib/main.cpp b/lldb/test/API/lang/cpp/odr-handling-with-dylib/main.cpp new file mode 100644 index 000000000000..f3372e0fbe70 --- /dev/null +++ b/lldb/test/API/lang/cpp/odr-handling-with-dylib/main.cpp @@ -0,0 +1,11 @@ +#include "plugin.h" + +#define HIDE_FROM_PLUGIN 1 +#include "service.h" + +int main() { + exported(); + plugin_init(); + plugin_entry(); + return 0; +} diff --git a/lldb/test/API/lang/cpp/odr-handling-with-dylib/plugin.cpp b/lldb/test/API/lang/cpp/odr-handling-with-dylib/plugin.cpp new file mode 100644 index 000000000000..190388000a3c --- /dev/null +++ b/lldb/test/API/lang/cpp/odr-handling-with-dylib/plugin.cpp @@ -0,0 +1,14 @@ +#include "plugin.h" +#include "service.h" + +struct Proxy : public Service { + State *proxyState; +}; + +Proxy *gProxyThis = 0; + +extern "C" { +void plugin_init() { gProxyThis = new Proxy; } + +void plugin_entry() {} +} diff --git a/lldb/test/API/lang/cpp/odr-handling-with-dylib/plugin.h b/lldb/test/API/lang/cpp/odr-handling-with-dylib/plugin.h new file mode 100644 index 000000000000..9d4ba5df5a83 --- /dev/null +++ b/lldb/test/API/lang/cpp/odr-handling-with-dylib/plugin.h @@ -0,0 +1,9 @@ +#ifndef PLUGIN_H_IN +#define PLUGIN_H_IN + +extern "C" { +void plugin_entry(void); +void plugin_init(void); +} + +#endif // PLUGIN_H_IN diff --git a/lldb/test/API/lang/cpp/odr-handling-with-dylib/service.cpp b/lldb/test/API/lang/cpp/odr-handling-with-dylib/service.cpp new file mode 100644 index 000000000000..6302a4548349 --- /dev/null +++ b/lldb/test/API/lang/cpp/odr-handling-with-dylib/service.cpp @@ -0,0 +1,15 @@ +#define HIDE_FROM_PLUGIN 1 +#include "service.h" + +struct ServiceAux { + Service *Owner; +}; + +struct Service::State {}; + +void exported() { + // Make sure debug-info for definition of Service is + // emitted in this CU. + Service service; + service.start(0); +} diff --git a/lldb/test/API/lang/cpp/odr-handling-with-dylib/service.h b/lldb/test/API/lang/cpp/odr-handling-with-dylib/service.h new file mode 100644 index 000000000000..37c6b9aeb2d9 --- /dev/null +++ b/lldb/test/API/lang/cpp/odr-handling-with-dylib/service.h @@ -0,0 +1,20 @@ +#ifndef SERVICE_H_IN +#define SERVICE_H_IN + +struct ServiceAux; + +struct Service { + struct State; + bool start(State *) { return true; } + +#ifdef HIDE_FROM_PLUGIN + int __resv1; +#endif // !HIDE_FROM_PLUGIN + + Service *__owner; + ServiceAux *aux; +}; + +void exported(); + +#endif -- GitLab From b7bc1d07d3e1b2d6db102d881f8ad1083797f319 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 18 Oct 2024 14:32:31 +0100 Subject: [PATCH 056/511] [CodeGen] Fix return type of PHI_iterator::getIncomingValue. NFC. This is supposed to match ValT aka Register. 
--- llvm/lib/CodeGen/MachineSSAUpdater.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp index 4cbb6ad3128b..c7a673b12d8c 100644 --- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp +++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp @@ -286,7 +286,7 @@ public: bool operator==(const PHI_iterator& x) const { return idx == x.idx; } bool operator!=(const PHI_iterator& x) const { return !operator==(x); } - unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); } + Register getIncomingValue() { return PHI->getOperand(idx).getReg(); } MachineBasicBlock *getIncomingBlock() { return PHI->getOperand(idx+1).getMBB(); -- GitLab From af90e7c5161de9a36af768dd5c9d73464e0eed64 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Fri, 18 Oct 2024 15:42:54 +0200 Subject: [PATCH 057/511] [Clang] Fix an assertion in expression recovery (#112888) Explicit object member function calls are not modelled as member calls Fixes #112559 --- clang/docs/ReleaseNotes.rst | 3 ++- clang/lib/AST/Expr.cpp | 2 +- clang/test/SemaCXX/cxx2b-deducing-this.cpp | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b34da2d75570..b7a6ace8bb89 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -419,7 +419,7 @@ Improvements to Clang's diagnostics - The warning for an unsupported type for a named register variable is now phrased ``unsupported type for named register variable``, instead of ``bad type for named register variable``. This makes it clear that the type is not supported at all, rather than being suboptimal in some way the error fails to mention (#GH111550). - + - Clang now emits a ``-Wdepredcated-literal-operator`` diagnostic, even if the name was a reserved name, which we improperly allowed to suppress the diagnostic. @@ -538,6 +538,7 @@ Bug Fixes to C++ Support certain situations. (#GH47400), (#GH90896) - Fix erroneous templated array size calculation leading to crashes in generated code. (#GH41441) - During the lookup for a base class name, non-type names are ignored. (#GH16855) +- Fix a crash when recovering an invalid expression involving an explicit object member conversion operator. 
(#GH112559) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 9ecbf121e3fc..66db6263cb1b 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -1989,7 +1989,7 @@ Expr *CastExpr::getSubExprAsWritten() { SubExpr = IgnoreExprNodes(cast(SubExpr)->getArg(0), ignoreImplicitSemaNodes); } else if (E->getCastKind() == CK_UserDefinedConversion) { - assert((isa(SubExpr) || isa(SubExpr)) && + assert((isa(SubExpr)) && "Unexpected SubExpr for CK_UserDefinedConversion."); if (auto *MCE = dyn_cast(SubExpr)) SubExpr = MCE->getImplicitObjectArgument(); diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 2a984a75f37d..520052a89d18 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -1097,3 +1097,20 @@ struct C4 { // expected-warning {{volatile-qualified parameter type 'const volatile C4' is deprecated}} }; } + + +namespace GH112559 { +struct Wrap {}; +struct S { + constexpr operator Wrap (this const S& self) { + return Wrap{}; + }; + constexpr int operator <<(this Wrap self, int i) { + return 0; + } +}; +// Purposefully invalid expression to check an assertion in the +// expression recovery machinery. +static_assert((S{} << 11) == a); +// expected-error@-1 {{use of undeclared identifier 'a'}} +} -- GitLab From b0dbd2ca5b52a277560a70a2864ea9949f1e3794 Mon Sep 17 00:00:00 2001 From: lntue Date: Fri, 18 Oct 2024 09:58:15 -0400 Subject: [PATCH 058/511] [libc][math] Add option to set a specific exponent for frexp with Inf/NaN inputs. (#112387) In IEEE 754 and C standards, when calling `frexp` with Inf/Nan inputs, the exponent result is unspecified. In this case, FreeBSD libc and musl just passthrough `exp`, while glibc, FreeBSD libm set exp = 0, and MSVC set exp = -1. By default, LLVM libc will passthrough `exp` just as FreeBSD libc and musl, but we also allow users to explicitly choose the return exp value in this case for compatibility with other libc. 
Notice that, gcc did generate passthrough `exp` for `frexp(NaN/Inf, exp)`: https://godbolt.org/z/sM8fEej4E --- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 11 ++++++++--- libc/config/config.json | 4 ++++ libc/docs/configure.rst | 1 + libc/src/__support/FPUtil/ManipulationFunctions.h | 10 +++++++++- libc/test/src/math/smoke/FrexpTest.h | 11 +++++++++++ 5 files changed, 33 insertions(+), 4 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 737ac87f4c7a..0c658c6866c4 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -79,6 +79,14 @@ function(_get_compile_options_from_config output_var) list(APPEND config_options "-DLIBC_ADD_NULL_CHECKS") endif() + if(NOT "${LIBC_CONF_FREXP_INF_NAN_EXPONENT}" STREQUAL "") + list(APPEND config_options "-DLIBC_FREXP_INF_NAN_EXPONENT=${LIBC_CONF_FREXP_INF_NAN_EXPONENT}") + endif() + + if(LIBC_CONF_MATH_OPTIMIZATIONS) + list(APPEND compile_options "-DLIBC_MATH=${LIBC_CONF_MATH_OPTIMIZATIONS}") + endif() + set(${output_var} ${config_options} PARENT_SCOPE) endfunction(_get_compile_options_from_config) @@ -170,9 +178,6 @@ function(_get_common_compile_options output_var flags) list(APPEND compile_options "-Wthread-safety") list(APPEND compile_options "-Wglobal-constructors") endif() - if(LIBC_CONF_MATH_OPTIMIZATIONS) - list(APPEND compile_options "-DLIBC_MATH=${LIBC_CONF_MATH_OPTIMIZATIONS}") - endif() elseif(MSVC) list(APPEND compile_options "/EHs-c-") list(APPEND compile_options "/GR-") diff --git a/libc/config/config.json b/libc/config/config.json index 2e4f878778e6..9a5d5c3c68da 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -87,6 +87,10 @@ "LIBC_CONF_MATH_OPTIMIZATIONS": { "value": 0, "doc": "Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST." + }, + "LIBC_CONF_FREXP_INF_NAN_EXPONENT": { + "value": "", + "doc": "The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified. Configue an explicit exp value for Inf/NaN inputs." } }, "qsort": { diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 867bb807d10a..e225e6b566df 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -33,6 +33,7 @@ to learn about the defaults for your platform and target. * **"general" options** - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** + - ``LIBC_CONF_FREXP_INF_NAN_EXPONENT``: Set the specific exp value for Inf/NaN inputs. - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST. * **"printf" options** - ``LIBC_CONF_PRINTF_DISABLE_FIXED_POINT``: Disable printing fixed point values in printf and friends. 
diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index 66bfe2aa377f..9c10011ccd20 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -31,8 +31,16 @@ namespace fputil { template , int> = 0> LIBC_INLINE T frexp(T x, int &exp) { FPBits bits(x); - if (bits.is_inf_or_nan()) + if (bits.is_inf_or_nan()) { +#ifdef LIBC_FREXP_INF_NAN_EXPONENT + // The value written back to the second parameter when calling + // frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified in the standard. + // Set the exp value for Inf/NaN inputs explicitly to + // LIBC_FREXP_INF_NAN_EXPONENT if it is defined. + exp = LIBC_FREXP_INF_NAN_EXPONENT; +#endif // LIBC_FREXP_INF_NAN_EXPONENT return x; + } if (bits.is_zero()) { exp = 0; return x; diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h index 11641fc6743c..3fb3a2e1688c 100644 --- a/libc/test/src/math/smoke/FrexpTest.h +++ b/libc/test/src/math/smoke/FrexpTest.h @@ -21,8 +21,19 @@ public: void testSpecialNumbers(FrexpFunc func) { int exponent; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, &exponent)); +#ifdef LIBC_FREXP_INF_NAN_EXPONENT + EXPECT_EQ(LIBC_FREXP_INF_NAN_EXPONENT, exponent); +#endif // LIBC_FREXP_INF_NAN_EXPONENT + EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, &exponent)); +#ifdef LIBC_FREXP_INF_NAN_EXPONENT + EXPECT_EQ(LIBC_FREXP_INF_NAN_EXPONENT, exponent); +#endif // LIBC_FREXP_INF_NAN_EXPONENT + EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, &exponent)); +#ifdef LIBC_FREXP_INF_NAN_EXPONENT + EXPECT_EQ(LIBC_FREXP_INF_NAN_EXPONENT, exponent); +#endif // LIBC_FREXP_INF_NAN_EXPONENT EXPECT_FP_EQ_ALL_ROUNDING(zero, func(zero, &exponent)); EXPECT_EQ(exponent, 0); -- GitLab From 90bc60c5a82a596327ddc6956436abf146b44a7a Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 18 Oct 2024 10:11:07 -0400 Subject: [PATCH 059/511] [libc++] Re-add attribute macro to clang-format (#112746) That macro was removed incorrectly from the clang-format file because it had a typo in its name. However, the macro with the right name is still being used in the library (sadly, in a single place). --- libcxx/.clang-format | 1 + 1 file changed, 1 insertion(+) diff --git a/libcxx/.clang-format b/libcxx/.clang-format index a6154c7c4a2b..f548119652c1 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -30,6 +30,7 @@ AttributeMacros: [ '_LIBCPP_DEPRECATED_IN_CXX20', '_LIBCPP_DEPRECATED_IN_CXX23', '_LIBCPP_DEPRECATED', + '_LIBCPP_DISABLE_EXTENSION_WARNING', '_LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION', '_LIBCPP_EXPORTED_FROM_ABI', '_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS', -- GitLab From 67e84213f59e1f9485d15421bdb7243d25cee07e Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Fri, 18 Oct 2024 16:15:33 +0200 Subject: [PATCH 060/511] [analyzer][Solver] Teach SymbolicRangeInferrer about commutativity (2/2) (#112887) This patch should not introduce much overhead as it only does one more constraint map lookup, which is really quick. 
Depends on #112583 --- .../Core/RangeConstraintManager.cpp | 17 ++++++++++ clang/test/Analysis/unary-sym-expr.c | 33 +++++++++++++++++-- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp index ecf7974c8386..f0311b7028f5 100644 --- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp @@ -1249,6 +1249,8 @@ public: // calculate the effective range set by intersecting the range set // for A - B and the negated range set of B - A. getRangeForNegatedSymSym(SSE), + // If commutative, we may have constaints for the commuted variant. + getRangeCommutativeSymSym(SSE), // If Sym is a comparison expression (except <=>), // find any other comparisons with the same operands. // See function description. @@ -1485,6 +1487,21 @@ private: Sym->getType()); } + std::optional getRangeCommutativeSymSym(const SymSymExpr *SSE) { + auto Op = SSE->getOpcode(); + bool IsCommutative = llvm::is_contained( + // ==, !=, |, &, +, *, ^ + {BO_EQ, BO_NE, BO_Or, BO_And, BO_Add, BO_Mul, BO_Xor}, Op); + if (!IsCommutative) + return std::nullopt; + + SymbolRef Commuted = State->getSymbolManager().getSymSymExpr( + SSE->getRHS(), Op, SSE->getLHS(), SSE->getType()); + if (const RangeSet *Range = getConstraint(State, Commuted)) + return *Range; + return std::nullopt; + } + // Returns ranges only for binary comparison operators (except <=>) // when left and right operands are symbolic values. // Finds any other comparisons with the same operands. diff --git a/clang/test/Analysis/unary-sym-expr.c b/clang/test/Analysis/unary-sym-expr.c index 7c4774f3cca8..92e11b295bee 100644 --- a/clang/test/Analysis/unary-sym-expr.c +++ b/clang/test/Analysis/unary-sym-expr.c @@ -29,12 +29,39 @@ int test(int x, int y) { return 42; } -void test_svalbuilder_simplification(int x, int y) { +void test_svalbuilder_simplification_add(int x, int y) { if (x + y != 3) return; clang_analyzer_eval(-(x + y) == -3); // expected-warning{{TRUE}} - // FIXME Commutativity is not supported yet. 
- clang_analyzer_eval(-(y + x) == -3); // expected-warning{{UNKNOWN}} + clang_analyzer_eval(-(y + x) == -3); // expected-warning{{TRUE}} +} + +void test_svalbuilder_simplification_mul(int x, int y) { + if (x * y != 3) + return; + clang_analyzer_eval(-(x * y) == -3); // expected-warning{{TRUE}} + clang_analyzer_eval(-(y * x) == -3); // expected-warning{{TRUE}} +} + +void test_svalbuilder_simplification_and(int x, int y) { + if ((x & y) != 3) + return; + clang_analyzer_eval(-(x & y) == -3); // expected-warning{{TRUE}} + clang_analyzer_eval(-(y & x) == -3); // expected-warning{{TRUE}} +} + +void test_svalbuilder_simplification_or(int x, int y) { + if ((x | y) != 3) + return; + clang_analyzer_eval(-(x | y) == -3); // expected-warning{{TRUE}} + clang_analyzer_eval(-(y | x) == -3); // expected-warning{{TRUE}} +} + +void test_svalbuilder_simplification_xor(int x, int y) { + if ((x ^ y) != 3) + return; + clang_analyzer_eval(-(x ^ y) == -3); // expected-warning{{TRUE}} + clang_analyzer_eval(-(y ^ x) == -3); // expected-warning{{TRUE}} } int test_fp(int flag) { -- GitLab From 1b49ee73fc3512551066cd3c6b969fad589c9d5e Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Fri, 18 Oct 2024 16:16:29 +0200 Subject: [PATCH 061/511] [analyzer][Solver][NFC] Cleanup const-correctness inside range-based solver (#112891) --- .../Core/RangeConstraintManager.cpp | 59 +++++++++---------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp index f0311b7028f5..c39fa81109c8 100644 --- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp @@ -1953,27 +1953,27 @@ public: const llvm::APSInt &To, const llvm::APSInt &Adjustment) override; private: - RangeSet::Factory F; + mutable RangeSet::Factory F; - RangeSet getRange(ProgramStateRef State, SymbolRef Sym); + RangeSet getRange(ProgramStateRef State, SymbolRef Sym) const; ProgramStateRef setRange(ProgramStateRef State, SymbolRef Sym, RangeSet Range); RangeSet getSymLTRange(ProgramStateRef St, SymbolRef Sym, const llvm::APSInt &Int, - const llvm::APSInt &Adjustment); + const llvm::APSInt &Adjustment) const; RangeSet getSymGTRange(ProgramStateRef St, SymbolRef Sym, const llvm::APSInt &Int, - const llvm::APSInt &Adjustment); + const llvm::APSInt &Adjustment) const; RangeSet getSymLERange(ProgramStateRef St, SymbolRef Sym, const llvm::APSInt &Int, - const llvm::APSInt &Adjustment); + const llvm::APSInt &Adjustment) const; RangeSet getSymLERange(llvm::function_ref RS, const llvm::APSInt &Int, - const llvm::APSInt &Adjustment); + const llvm::APSInt &Adjustment) const; RangeSet getSymGERange(ProgramStateRef St, SymbolRef Sym, const llvm::APSInt &Int, - const llvm::APSInt &Adjustment); + const llvm::APSInt &Adjustment) const; }; //===----------------------------------------------------------------------===// @@ -2880,21 +2880,18 @@ ConditionTruthVal RangeConstraintManager::checkNull(ProgramStateRef State, const llvm::APSInt *RangeConstraintManager::getSymVal(ProgramStateRef St, SymbolRef Sym) const { - auto &MutableSelf = const_cast(*this); - return MutableSelf.getRange(St, Sym).getConcreteValue(); + return getRange(St, Sym).getConcreteValue(); } const llvm::APSInt *RangeConstraintManager::getSymMinVal(ProgramStateRef St, SymbolRef Sym) const { - auto &MutableSelf = const_cast(*this); - RangeSet Range = MutableSelf.getRange(St, Sym); + RangeSet Range = getRange(St, Sym); return 
Range.isEmpty() ? nullptr : &Range.getMinValue(); } const llvm::APSInt *RangeConstraintManager::getSymMaxVal(ProgramStateRef St, SymbolRef Sym) const { - auto &MutableSelf = const_cast(*this); - RangeSet Range = MutableSelf.getRange(St, Sym); + RangeSet Range = getRange(St, Sym); return Range.isEmpty() ? nullptr : &Range.getMaxValue(); } @@ -3039,7 +3036,7 @@ RangeConstraintManager::removeDeadBindings(ProgramStateRef State, } RangeSet RangeConstraintManager::getRange(ProgramStateRef State, - SymbolRef Sym) { + SymbolRef Sym) const { return SymbolicRangeInferrer::inferRange(F, State, Sym); } @@ -3094,10 +3091,10 @@ RangeConstraintManager::assumeSymEQ(ProgramStateRef St, SymbolRef Sym, return setRange(St, Sym, New); } -RangeSet RangeConstraintManager::getSymLTRange(ProgramStateRef St, - SymbolRef Sym, - const llvm::APSInt &Int, - const llvm::APSInt &Adjustment) { +RangeSet +RangeConstraintManager::getSymLTRange(ProgramStateRef St, SymbolRef Sym, + const llvm::APSInt &Int, + const llvm::APSInt &Adjustment) const { // Before we do any real work, see if the value can even show up. APSIntType AdjustmentType(Adjustment); switch (AdjustmentType.testInRange(Int, true)) { @@ -3131,10 +3128,10 @@ RangeConstraintManager::assumeSymLT(ProgramStateRef St, SymbolRef Sym, return setRange(St, Sym, New); } -RangeSet RangeConstraintManager::getSymGTRange(ProgramStateRef St, - SymbolRef Sym, - const llvm::APSInt &Int, - const llvm::APSInt &Adjustment) { +RangeSet +RangeConstraintManager::getSymGTRange(ProgramStateRef St, SymbolRef Sym, + const llvm::APSInt &Int, + const llvm::APSInt &Adjustment) const { // Before we do any real work, see if the value can even show up. APSIntType AdjustmentType(Adjustment); switch (AdjustmentType.testInRange(Int, true)) { @@ -3168,10 +3165,10 @@ RangeConstraintManager::assumeSymGT(ProgramStateRef St, SymbolRef Sym, return setRange(St, Sym, New); } -RangeSet RangeConstraintManager::getSymGERange(ProgramStateRef St, - SymbolRef Sym, - const llvm::APSInt &Int, - const llvm::APSInt &Adjustment) { +RangeSet +RangeConstraintManager::getSymGERange(ProgramStateRef St, SymbolRef Sym, + const llvm::APSInt &Int, + const llvm::APSInt &Adjustment) const { // Before we do any real work, see if the value can even show up. APSIntType AdjustmentType(Adjustment); switch (AdjustmentType.testInRange(Int, true)) { @@ -3208,7 +3205,7 @@ RangeConstraintManager::assumeSymGE(ProgramStateRef St, SymbolRef Sym, RangeSet RangeConstraintManager::getSymLERange(llvm::function_ref RS, const llvm::APSInt &Int, - const llvm::APSInt &Adjustment) { + const llvm::APSInt &Adjustment) const { // Before we do any real work, see if the value can even show up. APSIntType AdjustmentType(Adjustment); switch (AdjustmentType.testInRange(Int, true)) { @@ -3234,10 +3231,10 @@ RangeConstraintManager::getSymLERange(llvm::function_ref RS, return F.intersect(Default, Lower, Upper); } -RangeSet RangeConstraintManager::getSymLERange(ProgramStateRef St, - SymbolRef Sym, - const llvm::APSInt &Int, - const llvm::APSInt &Adjustment) { +RangeSet +RangeConstraintManager::getSymLERange(ProgramStateRef St, SymbolRef Sym, + const llvm::APSInt &Int, + const llvm::APSInt &Adjustment) const { return getSymLERange([&] { return getRange(St, Sym); }, Int, Adjustment); } -- GitLab From 803220db43207254d7fced50dcc0686c4ee65474 Mon Sep 17 00:00:00 2001 From: wldfngrs Date: Fri, 18 Oct 2024 15:24:19 +0100 Subject: [PATCH 062/511] [libc] changed the return cast from static_cast to fputil::cast in exp10f16.cpp. 
(#112889) switch to fputil::cast to fix rounding with compiler-rt --- libc/src/math/generic/exp10f16.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp index f7a8ee3245ed..006dd5c55442 100644 --- a/libc/src/math/generic/exp10f16.cpp +++ b/libc/src/math/generic/exp10f16.cpp @@ -124,7 +124,7 @@ LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { // 10^x = 2^((hi + mid) * log2(10)) * 10^lo auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x); - return static_cast(exp2_hi_mid * exp10_lo); + return fputil::cast(exp2_hi_mid * exp10_lo); } } // namespace LIBC_NAMESPACE_DECL -- GitLab From 783901bd2008cbe835ef394f6c3147013604e95f Mon Sep 17 00:00:00 2001 From: tltao Date: Fri, 18 Oct 2024 10:26:50 -0400 Subject: [PATCH 063/511] [SystemZ] Rename SystemZ ATT Asm dialect to GNU Asm dialect (#112800) The ATT assembler dialect on SystemZ seems to have been taken from the existing ATT/Intel code. However, on SystemZ, ATT does not hold any meaning. In reality, we are splitting the difference between GNU Asm syntax and HLASM Asm syntax, so it makes sense to rename ATT to GNU instead. Co-authored-by: Tony Tao --- .../SystemZ/AsmParser/SystemZAsmParser.cpp | 16 ++++++------ .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 2 +- .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.h | 2 +- llvm/lib/Target/SystemZ/SystemZ.td | 6 ++--- .../lib/Target/SystemZ/SystemZInstrFormats.td | 26 +++++++++---------- llvm/lib/Target/SystemZ/SystemZInstrInfo.td | 6 ++--- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 5b26ba08dbdb..f0a85645b862 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -442,7 +442,7 @@ private: bool parseOperand(OperandVector &Operands, StringRef Mnemonic); - // Both the hlasm and att variants still rely on the basic gnu asm + // Both the hlasm and gnu variants still rely on the basic gnu asm // format with respect to inputs, clobbers, outputs etc. // // However, calling the overriden getAssemblerDialect() method in @@ -475,8 +475,8 @@ private: // Are we parsing using the AD_HLASM dialect? inline bool isParsingHLASM() { return getMAIAssemblerDialect() == AD_HLASM; } - // Are we parsing using the AD_ATT dialect? - inline bool isParsingATT() { return getMAIAssemblerDialect() == AD_ATT; } + // Are we parsing using the AD_GNU dialect? + inline bool isParsingGNU() { return getMAIAssemblerDialect() == AD_GNU; } public: SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, @@ -848,7 +848,7 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands, } // Handle register names of the form % - if (isParsingATT() && Parser.getTok().is(AsmToken::Percent)) { + if (isParsingGNU() && Parser.getTok().is(AsmToken::Percent)) { if (parseRegister(Reg, /*RequirePercent=*/true)) return ParseStatus::Failure; @@ -1029,7 +1029,7 @@ bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1, if (getLexer().is(AsmToken::LParen)) { Parser.Lex(); - if (isParsingATT() && getLexer().is(AsmToken::Percent)) { + if (isParsingGNU() && getLexer().is(AsmToken::Percent)) { // Parse the first register. 
HaveReg1 = true; if (parseRegister(Reg1, /*RequirePercent=*/true)) @@ -1072,7 +1072,7 @@ bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1, if (parseIntegerRegister(Reg2, RegGR)) return true; } else { - if (isParsingATT() && parseRegister(Reg2, /*RequirePercent=*/true)) + if (isParsingGNU() && parseRegister(Reg2, /*RequirePercent=*/true)) return true; } } @@ -1490,7 +1490,7 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, // a context-dependent parse routine, which gives the required register // class. The code is here to mop up other cases, like those where // the instruction isn't recognized. - if (isParsingATT() && Parser.getTok().is(AsmToken::Percent)) { + if (isParsingGNU() && Parser.getTok().is(AsmToken::Percent)) { Register Reg; if (parseRegister(Reg, /*RequirePercent=*/true)) return true; @@ -1672,7 +1672,7 @@ ParseStatus SystemZAsmParser::parsePCRel(OperandVector &Operands, } bool SystemZAsmParser::isLabel(AsmToken &Token) { - if (isParsingATT()) + if (isParsingGNU()) return true; // HLASM labels are ordinary symbols. diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp index 66555fa06b06..3a1d01cac2b9 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp @@ -13,7 +13,7 @@ using namespace llvm; SystemZMCAsmInfoELF::SystemZMCAsmInfoELF(const Triple &TT) { - AssemblerDialect = AD_ATT; + AssemblerDialect = AD_GNU; CalleeSaveStackSlotSize = 8; CodePointerSize = 8; Data64bitsDirective = "\t.quad\t"; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h index b2f191424d01..58b9a3dd652e 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h @@ -15,7 +15,7 @@ namespace llvm { class Triple; -enum SystemZAsmDialect { AD_ATT = 0, AD_HLASM = 1 }; +enum SystemZAsmDialect { AD_GNU = 0, AD_HLASM = 1 }; class SystemZMCAsmInfoELF : public MCAsmInfoELF { public: diff --git a/llvm/lib/Target/SystemZ/SystemZ.td b/llvm/lib/Target/SystemZ/SystemZ.td index e18deede544a..9d0c77eafa2e 100644 --- a/llvm/lib/Target/SystemZ/SystemZ.td +++ b/llvm/lib/Target/SystemZ/SystemZ.td @@ -67,11 +67,11 @@ def SystemZAsmParser : AsmParser { let ShouldEmitMatchRegisterName = 0; } -def ATTAsmParserVariant : AsmParserVariant { +def GNUAsmParserVariant : AsmParserVariant { int Variant = 0; // Variant name. - string Name = "att"; + string Name = "gnu"; } def HLASMAsmParserVariant : AsmParserVariant { @@ -88,6 +88,6 @@ def HLASMAsmParserVariant : AsmParserVariant { def SystemZ : Target { let InstructionSet = SystemZInstrInfo; let AssemblyParsers = [SystemZAsmParser]; - let AssemblyParserVariants = [ATTAsmParserVariant, HLASMAsmParserVariant]; + let AssemblyParserVariants = [GNUAsmParserVariant, HLASMAsmParserVariant]; let AllowRegisterRenaming = 1; } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index 9a12718db7cb..50f636a8d968 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2050,7 +2050,7 @@ class CondVariant ccmaskin, string suffixin, bit alternatein, bit alternate = alternatein; // Whether this needs be to restricted to a specific dialect. 
- // Valid values are "att" and "hlasm", which when passed in + // Valid values are "gnu" and "hlasm", which when passed in // will set AsmVariantName. string asmvariant = asmvariantin; } @@ -2063,20 +2063,20 @@ def CondAlways : CondVariant<15, "", 0>; def CondVariantO : CondVariant<1, "o", 0>; def CondVariantH : CondVariant<2, "h", 0>; def CondVariantP : CondVariant<2, "p", 1>; -def CondVariantNLE : CondVariant<3, "nle", 0, "att">; +def CondVariantNLE : CondVariant<3, "nle", 0, "gnu">; def CondVariantL : CondVariant<4, "l", 0>; def CondVariantM : CondVariant<4, "m", 1>; -def CondVariantNHE : CondVariant<5, "nhe", 0, "att">; -def CondVariantLH : CondVariant<6, "lh", 0, "att">; +def CondVariantNHE : CondVariant<5, "nhe", 0, "gnu">; +def CondVariantLH : CondVariant<6, "lh", 0, "gnu">; def CondVariantNE : CondVariant<7, "ne", 0>; def CondVariantNZ : CondVariant<7, "nz", 1>; def CondVariantE : CondVariant<8, "e", 0>; def CondVariantZ : CondVariant<8, "z", 1>; -def CondVariantNLH : CondVariant<9, "nlh", 0, "att">; -def CondVariantHE : CondVariant<10, "he", 0, "att">; +def CondVariantNLH : CondVariant<9, "nlh", 0, "gnu">; +def CondVariantHE : CondVariant<10, "he", 0, "gnu">; def CondVariantNL : CondVariant<11, "nl", 0>; def CondVariantNM : CondVariant<11, "nm", 1>; -def CondVariantLE : CondVariant<12, "le", 0, "att">; +def CondVariantLE : CondVariant<12, "le", 0, "gnu">; def CondVariantNH : CondVariant<13, "nh", 0>; def CondVariantNP : CondVariant<13, "np", 1>; def CondVariantNO : CondVariant<14, "no", 0>; @@ -2093,16 +2093,16 @@ class CV // and that the low bit of the mask is therefore always 0. This means // that each condition has two names. Conditions "o" and "no" are not used. def IntCondVariantH : CondVariant<2, "h", 0>; -def IntCondVariantNLE : CondVariant<2, "nle", 1, "att">; +def IntCondVariantNLE : CondVariant<2, "nle", 1, "gnu">; def IntCondVariantL : CondVariant<4, "l", 0>; -def IntCondVariantNHE : CondVariant<4, "nhe", 1, "att">; -def IntCondVariantLH : CondVariant<6, "lh", 0, "att">; +def IntCondVariantNHE : CondVariant<4, "nhe", 1, "gnu">; +def IntCondVariantLH : CondVariant<6, "lh", 0, "gnu">; def IntCondVariantNE : CondVariant<6, "ne", 1>; def IntCondVariantE : CondVariant<8, "e", 0>; -def IntCondVariantNLH : CondVariant<8, "nlh", 1, "att">; -def IntCondVariantHE : CondVariant<10, "he", 0, "att">; +def IntCondVariantNLH : CondVariant<8, "nlh", 1, "gnu">; +def IntCondVariantHE : CondVariant<10, "he", 0, "gnu">; def IntCondVariantNL : CondVariant<10, "nl", 1>; -def IntCondVariantLE : CondVariant<12, "le", 0, "att">; +def IntCondVariantLE : CondVariant<12, "le", 0, "gnu">; def IntCondVariantNH : CondVariant<12, "nh", 1>; // A helper class to look up one of the above by name. diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 95ed1a00e603..f3baf896658d 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -121,7 +121,7 @@ def NOPR_bare : InstAlias<"nopr", (NOPR R0D), 0>; def JNOP : InstAlias<"jnop\t$RI2", (BRCAsm 0, brtarget16:$RI2), 0>; // An alias of BRCL 0, label -// jgnop on att ; jlnop on hlasm +// jgnop on gnu ; jlnop on hlasm def JGNOP : InstAlias<"{jgnop|jlnop}\t$RI2", (BRCLAsm 0, brtarget32:$RI2), 0>; // Fused compare-and-branch instructions. 
@@ -2351,12 +2351,12 @@ def JXHG : MnemonicAlias<"jxhg", "brxhg">; def JXLEG : MnemonicAlias<"jxleg", "brxlg">; def BRU : MnemonicAlias<"bru", "j">; -def BRUL : MnemonicAlias<"brul", "jg", "att">; +def BRUL : MnemonicAlias<"brul", "jg", "gnu">; def BRUL_HLASM : MnemonicAlias<"brul", "jlu", "hlasm">; foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE", "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in { defm BRUAsm#V : MnemonicCondBranchAlias , "br#", "j#">; - defm BRULAsm#V : MnemonicCondBranchAlias , "br#l", "jg#", "att">; + defm BRULAsm#V : MnemonicCondBranchAlias , "br#l", "jg#", "gnu">; defm BRUL_HLASMAsm#V : MnemonicCondBranchAlias , "br#l", "jl#", "hlasm">; } -- GitLab From 0a3347dc638594bef802d8148a77052c198ec27b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 18 Oct 2024 15:27:43 +0100 Subject: [PATCH 064/511] [mlir][linalg] Fix idx comparison in the vectorizer (#112900) Fixes loop comparison condition in the vectorizer. As that logic is used specifically for vectorising `tensor.extract`, I also added a test that violates the assumptions made inside `getTrailingNonUnitLoopDimIdx`, namely that Linalg loops are non-empty. Vectorizer pre-conditions will capture that much earlier making sure that `getTrailingNonUnitLoopDimIdx` is only run when all the assumptions are actually met. Thank you for pointing this out, @pfusik ! --- .../Linalg/Transforms/Vectorization.cpp | 3 ++- .../Linalg/vectorize-tensor-extract.mlir | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index d39c5fcdbc42..e1b97fbf985d 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -863,9 +863,10 @@ static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) { llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == 1) && "For statically shaped Linalg Ops, only one " "non-unit loop dim is expected"); + assert(loopRanges.size() != 0 && "Empty loops, nothing to analyse."); size_t idx = loopRanges.size() - 1; - for (; idx >= 0; idx--) + for (; idx != 0; idx--) if (loopRanges[idx] != 1) break; diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir index 2c56b7139fec..3560ab2312a2 100644 --- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir +++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir @@ -36,6 +36,33 @@ module attributes {transform.with_named_sequence} { } } +// ----- + +#map = affine_map<() -> ()> +func.func @negative_no_loops(%arg0: tensor, %arg1: tensor) -> tensor { + %1 = linalg.generic { + indexing_maps = [#map], + iterator_types = [] + } outs(%arg1 : tensor) { + ^bb0(%arg4: f32): + %2 = tensor.extract %arg0[] : tensor + linalg.yield %2 : f32 + } -> tensor + return %1 : tensor +} +// CHECK-LABEL: func.func @negative_no_loops +// CHECK: tensor.extract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + 
+ // ----- #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -- GitLab From 68efaaafe4c34e332ec1a20382a97f77e575165e Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 18 Oct 2024 15:40:35 +0100 Subject: [PATCH 065/511] [TableGen] Remove unused tokens FalseKW and TrueKW These were introduced in https://reviews.llvm.org/D90635 but never used. --- llvm/lib/TableGen/TGLexer.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 4fa4d84d0535..9a6874c89757 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -80,7 +80,6 @@ enum TokKind { Code, Dag, ElseKW, - FalseKW, Field, In, Include, @@ -88,7 +87,6 @@ enum TokKind { List, String, Then, - TrueKW, // Object start tokens. OBJECT_START_FIRST, -- GitLab From c27aae0035d2cf490c02a0cc0e2e1fbe4f12512a Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 18 Oct 2024 09:46:13 -0500 Subject: [PATCH 066/511] [Offload] Fix not copying the buffer identifier of offloading files Summary: This caused an error when copying a file of the same name when multiple architectures needed the file. The buffer identifier which we use for the name in `-save-temps` mode would be empty and create in invalid filename. Copy this correctly now. --- llvm/include/llvm/Object/OffloadBinary.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h index 13383d5f07ba..c02aec8d956e 100644 --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -165,7 +165,8 @@ public: /// Make a deep copy of this offloading file. OffloadFile copy() const { std::unique_ptr Buffer = MemoryBuffer::getMemBufferCopy( - getBinary()->getMemoryBufferRef().getBuffer()); + getBinary()->getMemoryBufferRef().getBuffer(), + getBinary()->getMemoryBufferRef().getBufferIdentifier()); // This parsing should never fail because it has already been parsed. 
auto NewBinaryOrErr = OffloadBinary::create(*Buffer); -- GitLab From 62e2c7fb2d18b43149a07526f6a3c0563d50e2fa Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 18 Oct 2024 07:50:22 -0700 Subject: [PATCH 067/511] [LLVM][TableGen] Change all `Init` pointers to const (#112705) This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../utils/TableGen/ClangOptionDocEmitter.cpp | 4 +- llvm/include/llvm/TableGen/Error.h | 4 +- llvm/include/llvm/TableGen/Record.h | 526 ++++++----- llvm/lib/TableGen/DetailedRecordsBackend.cpp | 2 +- llvm/lib/TableGen/Error.cpp | 4 +- llvm/lib/TableGen/Record.cpp | 873 +++++++++--------- llvm/lib/TableGen/TGParser.cpp | 428 ++++----- llvm/lib/TableGen/TGParser.h | 82 +- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 44 +- llvm/utils/TableGen/AsmWriterEmitter.cpp | 6 +- llvm/utils/TableGen/Attributes.cpp | 2 +- .../TableGen/Basic/CodeGenIntrinsics.cpp | 2 +- llvm/utils/TableGen/CodeEmitterGen.cpp | 2 +- llvm/utils/TableGen/CodeGenMapTable.cpp | 4 +- .../TableGen/Common/CodeGenDAGPatterns.cpp | 61 +- .../TableGen/Common/CodeGenDAGPatterns.h | 10 +- .../TableGen/Common/CodeGenInstAlias.cpp | 4 +- llvm/utils/TableGen/Common/CodeGenInstAlias.h | 2 +- .../TableGen/Common/CodeGenInstruction.cpp | 22 +- .../TableGen/Common/CodeGenInstruction.h | 4 +- .../TableGen/Common/CodeGenRegisters.cpp | 16 +- .../utils/TableGen/Common/CodeGenSchedule.cpp | 9 +- llvm/utils/TableGen/Common/CodeGenTarget.cpp | 10 +- .../Common/GlobalISel/PatternParser.cpp | 4 +- .../TableGen/Common/GlobalISel/Patterns.cpp | 2 +- .../TableGen/Common/VarLenCodeEmitterGen.cpp | 11 +- llvm/utils/TableGen/CompressInstEmitter.cpp | 7 +- llvm/utils/TableGen/DAGISelMatcherGen.cpp | 2 +- llvm/utils/TableGen/DFAEmitter.cpp | 2 +- llvm/utils/TableGen/DXILEmitter.cpp | 2 +- llvm/utils/TableGen/DecoderEmitter.cpp | 36 +- .../TableGen/GlobalISelCombinerEmitter.cpp | 2 +- llvm/utils/TableGen/GlobalISelEmitter.cpp | 4 +- llvm/utils/TableGen/InstrInfoEmitter.cpp | 4 +- llvm/utils/TableGen/OptionParserEmitter.cpp | 10 +- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 6 +- .../utils/TableGen/SearchableTableEmitter.cpp | 9 +- llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 6 +- .../utils/TableGen/X86InstrMappingEmitter.cpp | 2 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 8 +- mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp | 3 +- 41 files changed, 1144 insertions(+), 1097 deletions(-) diff --git a/clang/utils/TableGen/ClangOptionDocEmitter.cpp b/clang/utils/TableGen/ClangOptionDocEmitter.cpp index b67c5d1d1146..ba8840c1bdca 100644 --- a/clang/utils/TableGen/ClangOptionDocEmitter.cpp +++ b/clang/utils/TableGen/ClangOptionDocEmitter.cpp @@ -367,13 +367,13 @@ void emitOption(const DocumentedOption &Option, const Record *DocInfo, for (const Record *VisibilityHelp : R->getValueAsListOfDefs("HelpTextsForVariants")) { // This is a list of visibilities. - ArrayRef Visibilities = + ArrayRef Visibilities = VisibilityHelp->getValueAsListInit("Visibilities")->getValues(); // See if any of the program's visibilities are in the list. for (StringRef DocInfoMask : DocInfo->getValueAsListOfStrings("VisibilityMask")) { - for (Init *Visibility : Visibilities) { + for (const Init *Visibility : Visibilities) { if (Visibility->getAsUnquotedString() == DocInfoMask) { // Use the first one we find. 
Description = escapeRST(VisibilityHelp->getValueAsString("Text")); diff --git a/llvm/include/llvm/TableGen/Error.h b/llvm/include/llvm/TableGen/Error.h index 512249b0160c..b963dcba9869 100644 --- a/llvm/include/llvm/TableGen/Error.h +++ b/llvm/include/llvm/TableGen/Error.h @@ -49,8 +49,8 @@ void PrintError(const RecordVal *RecVal, const Twine &Msg); [[noreturn]] void PrintFatalError(function_ref PrintMsg); // Returns true if the assert failed. -bool CheckAssert(SMLoc Loc, Init *Condition, Init *Message); -void dumpMessage(SMLoc Loc, Init *Message); +bool CheckAssert(SMLoc Loc, const Init *Condition, const Init *Message); +void dumpMessage(SMLoc Loc, const Init *Message); extern SourceMgr SrcMgr; extern unsigned ErrorsPrinted; diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index f856ff4cbd34..63267b7633f6 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -374,25 +374,26 @@ public: /// If this value is convertible to type \p Ty, return a value whose /// type is \p Ty, generating a !cast operation if required. /// Otherwise, return null. - virtual Init *getCastTo(const RecTy *Ty) const = 0; + virtual const Init *getCastTo(const RecTy *Ty) const = 0; /// Convert to a value whose type is \p Ty, or return null if this /// is not possible. This can happen if the value's type is convertible /// to \p Ty, but there are unresolved references. - virtual Init *convertInitializerTo(const RecTy *Ty) const = 0; + virtual const Init *convertInitializerTo(const RecTy *Ty) const = 0; /// This function is used to implement the bit range /// selection operator. Given a value, it selects the specified bits, /// returning them as a new \p Init of type \p bits. If it is not legal /// to use the bit selection operator on this value, null is returned. - virtual Init *convertInitializerBitRange(ArrayRef Bits) const { + virtual const Init * + convertInitializerBitRange(ArrayRef Bits) const { return nullptr; } /// This function is used to implement the FieldInit class. /// Implementors of this method should return the type of the named /// field if they are of type record. - virtual const RecTy *getFieldType(StringInit *FieldName) const { + virtual const RecTy *getFieldType(const StringInit *FieldName) const { return nullptr; } @@ -400,12 +401,12 @@ public: /// variables which may not be defined at the time the expression is formed. /// If a value is set for the variable later, this method will be called on /// users of the value to allow the value to propagate out. - virtual Init *resolveReferences(Resolver &R) const { + virtual const Init *resolveReferences(Resolver &R) const { return const_cast(this); } /// Get the \p Init value of the specified bit. - virtual Init *getBit(unsigned Bit) const = 0; + virtual const Init *getBit(unsigned Bit) const = 0; }; inline raw_ostream &operator<<(raw_ostream &OS, const Init &I) { @@ -436,15 +437,16 @@ public: /// Get the record keeper that initialized this Init. RecordKeeper &getRecordKeeper() const { return ValueTy->getRecordKeeper(); } - Init *getCastTo(const RecTy *Ty) const override; - Init *convertInitializerTo(const RecTy *Ty) const override; + const Init *getCastTo(const RecTy *Ty) const override; + const Init *convertInitializerTo(const RecTy *Ty) const override; - Init *convertInitializerBitRange(ArrayRef Bits) const override; + const Init * + convertInitializerBitRange(ArrayRef Bits) const override; /// This method is used to implement the FieldInit class. 
/// Implementors of this method should return the type of the named field if /// they are of type record. - const RecTy *getFieldType(StringInit *FieldName) const override; + const RecTy *getFieldType(const StringInit *FieldName) const override; }; /// '?' - Represents an uninitialized value. @@ -470,10 +472,10 @@ public: /// Get the record keeper that initialized this Init. RecordKeeper &getRecordKeeper() const { return RK; } - Init *getCastTo(const RecTy *Ty) const override; - Init *convertInitializerTo(const RecTy *Ty) const override; + const Init *getCastTo(const RecTy *Ty) const override; + const Init *convertInitializerTo(const RecTy *Ty) const override; - Init *getBit(unsigned Bit) const override { + const Init *getBit(unsigned Bit) const override { return const_cast(this); } @@ -487,7 +489,7 @@ public: }; // Represent an argument. -using ArgAuxType = std::variant; +using ArgAuxType = std::variant; class ArgumentInit : public Init, public FoldingSetNode { public: enum Kind { @@ -496,11 +498,11 @@ public: }; private: - Init *Value; + const Init *Value; ArgAuxType Aux; protected: - explicit ArgumentInit(Init *Value, ArgAuxType Aux) + explicit ArgumentInit(const Init *Value, ArgAuxType Aux) : Init(IK_ArgumentInit), Value(Value), Aux(Aux) {} public: @@ -511,25 +513,27 @@ public: RecordKeeper &getRecordKeeper() const { return Value->getRecordKeeper(); } - static ArgumentInit *get(Init *Value, ArgAuxType Aux); + static const ArgumentInit *get(const Init *Value, ArgAuxType Aux); bool isPositional() const { return Aux.index() == Positional; } bool isNamed() const { return Aux.index() == Named; } - Init *getValue() const { return Value; } + const Init *getValue() const { return Value; } unsigned getIndex() const { assert(isPositional() && "Should be positional!"); return std::get(Aux); } - Init *getName() const { + const Init *getName() const { assert(isNamed() && "Should be named!"); return std::get(Aux); } - ArgumentInit *cloneWithValue(Init *Value) const { return get(Value, Aux); } + const ArgumentInit *cloneWithValue(const Init *Value) const { + return get(Value, Aux); + } void Profile(FoldingSetNodeID &ID) const; - Init *resolveReferences(Resolver &R) const override; + const Init *resolveReferences(Resolver &R) const override; std::string getAsString() const override { if (isPositional()) return utostr(getIndex()) + ": " + Value->getAsString(); @@ -541,11 +545,11 @@ public: bool isComplete() const override { return false; } bool isConcrete() const override { return false; } - Init *getBit(unsigned Bit) const override { return Value->getBit(Bit); } - Init *getCastTo(const RecTy *Ty) const override { + const Init *getBit(unsigned Bit) const override { return Value->getBit(Bit); } + const Init *getCastTo(const RecTy *Ty) const override { return Value->getCastTo(Ty); } - Init *convertInitializerTo(const RecTy *Ty) const override { + const Init *convertInitializerTo(const RecTy *Ty) const override { return Value->convertInitializerTo(Ty); } }; @@ -571,9 +575,9 @@ public: bool getValue() const { return Value; } - Init *convertInitializerTo(const RecTy *Ty) const override; + const Init *convertInitializerTo(const RecTy *Ty) const override; - Init *getBit(unsigned Bit) const override { + const Init *getBit(unsigned Bit) const override { assert(Bit < 1 && "Bit index out of range!"); return const_cast(this); } @@ -584,8 +588,9 @@ public: /// '{ a, b, c }' - Represents an initializer for a BitsRecTy value. /// It contains a vector of bits, whose size is determined by the type. 
-class BitsInit final : public TypedInit, public FoldingSetNode, - public TrailingObjects { +class BitsInit final : public TypedInit, + public FoldingSetNode, + public TrailingObjects { unsigned NumBits; BitsInit(RecordKeeper &RK, unsigned N) @@ -602,14 +607,15 @@ public: return I->getKind() == IK_BitsInit; } - static BitsInit *get(RecordKeeper &RK, ArrayRef Range); + static BitsInit *get(RecordKeeper &RK, ArrayRef Range); void Profile(FoldingSetNodeID &ID) const; unsigned getNumBits() const { return NumBits; } - Init *convertInitializerTo(const RecTy *Ty) const override; - Init *convertInitializerBitRange(ArrayRef Bits) const override; + const Init *convertInitializerTo(const RecTy *Ty) const override; + const Init * + convertInitializerBitRange(ArrayRef Bits) const override; std::optional convertInitializerToInt() const; bool isComplete() const override { @@ -627,11 +633,11 @@ public: bool isConcrete() const override; std::string getAsString() const override; - Init *resolveReferences(Resolver &R) const override; + const Init *resolveReferences(Resolver &R) const override; - Init *getBit(unsigned Bit) const override { + const Init *getBit(unsigned Bit) const override { assert(Bit < NumBits && "Bit index out of range!"); - return getTrailingObjects()[Bit]; + return getTrailingObjects()[Bit]; } }; @@ -654,13 +660,14 @@ public: int64_t getValue() const { return Value; } - Init *convertInitializerTo(const RecTy *Ty) const override; - Init *convertInitializerBitRange(ArrayRef Bits) const override; + const Init *convertInitializerTo(const RecTy *Ty) const override; + const Init * + convertInitializerBitRange(ArrayRef Bits) const override; bool isConcrete() const override { return true; } std::string getAsString() const override; - Init *getBit(unsigned Bit) const override { + const Init *getBit(unsigned Bit) const override { return BitInit::get(getRecordKeeper(), (Value & (1ULL << Bit)) != 0); } }; @@ -684,13 +691,13 @@ public: unsigned getValue() const { return Value; } - StringInit *getNameInit() const; + const StringInit *getNameInit() const; std::string getAsString() const override; - Init *resolveReferences(Resolver &R) const override; + const Init *resolveReferences(Resolver &R) const override; - Init *getBit(unsigned Bit) const override { + const Init *getBit(unsigned Bit) const override { llvm_unreachable("Illegal bit reference off string"); } }; @@ -718,8 +725,8 @@ public: return I->getKind() == IK_StringInit; } - static StringInit *get(RecordKeeper &RK, StringRef, - StringFormat Fmt = SF_String); + static const StringInit *get(RecordKeeper &RK, StringRef, + StringFormat Fmt = SF_String); static StringFormat determineFormat(StringFormat Fmt1, StringFormat Fmt2) { return (Fmt1 == SF_Code || Fmt2 == SF_Code) ? 
SF_Code : SF_String; @@ -729,7 +736,7 @@ public: StringFormat getFormat() const { return Format; } bool hasCodeFormat() const { return Format == SF_Code; } - Init *convertInitializerTo(const RecTy *Ty) const override; + const Init *convertInitializerTo(const RecTy *Ty) const override; bool isConcrete() const override { return true; } @@ -744,19 +751,20 @@ public: return std::string(Value); } - Init *getBit(unsigned Bit) const override { + const Init *getBit(unsigned Bit) const override { llvm_unreachable("Illegal bit reference off string"); } }; /// [AL, AH, CL] - Represent a list of defs /// -class ListInit final : public TypedInit, public FoldingSetNode, - public TrailingObjects { +class ListInit final : public TypedInit, + public FoldingSetNode, + public TrailingObjects { unsigned NumValues; public: - using const_iterator = Init *const *; + using const_iterator = const Init *const *; private: explicit ListInit(unsigned N, const RecTy *EltTy) @@ -772,13 +780,13 @@ public: static bool classof(const Init *I) { return I->getKind() == IK_ListInit; } - static ListInit *get(ArrayRef Range, const RecTy *EltTy); + static const ListInit *get(ArrayRef Range, const RecTy *EltTy); void Profile(FoldingSetNodeID &ID) const; - Init *getElement(unsigned i) const { + const Init *getElement(unsigned i) const { assert(i < NumValues && "List element index out of range!"); - return getTrailingObjects()[i]; + return getTrailingObjects()[i]; } const RecTy *getElementType() const { return cast(getType())->getElementType(); @@ -786,30 +794,30 @@ public: const Record *getElementAsRecord(unsigned i) const; - Init *convertInitializerTo(const RecTy *Ty) const override; + const Init *convertInitializerTo(const RecTy *Ty) const override; /// This method is used by classes that refer to other /// variables which may not be defined at the time they expression is formed. /// If a value is set for the variable later, this method will be called on /// users of the value to allow the value to propagate out. /// - Init *resolveReferences(Resolver &R) const override; + const Init *resolveReferences(Resolver &R) const override; bool isComplete() const override; bool isConcrete() const override; std::string getAsString() const override; - ArrayRef getValues() const { - return ArrayRef(getTrailingObjects(), NumValues); + ArrayRef getValues() const { + return ArrayRef(getTrailingObjects(), NumValues); } - const_iterator begin() const { return getTrailingObjects(); } + const_iterator begin() const { return getTrailingObjects(); } const_iterator end () const { return begin() + NumValues; } size_t size () const { return NumValues; } bool empty() const { return NumValues == 0; } - Init *getBit(unsigned Bit) const override { + const Init *getBit(unsigned Bit) const override { llvm_unreachable("Illegal bit reference off list"); } }; @@ -831,12 +839,12 @@ public: } // Clone - Clone this operator, replacing arguments with the new list - virtual OpInit *clone(ArrayRef Operands) const = 0; + virtual const OpInit *clone(ArrayRef Operands) const = 0; virtual unsigned getNumOperands() const = 0; - virtual Init *getOperand(unsigned i) const = 0; + virtual const Init *getOperand(unsigned i) const = 0; - Init *getBit(unsigned Bit) const override; + const Init *getBit(unsigned Bit) const override; }; /// !op (X) - Transform an init. 
@@ -859,9 +867,9 @@ public:
 };

 private:
-  Init *LHS;
-  UnOpInit(UnaryOp opc, Init *lhs, const RecTy *Type)
+  const Init *LHS;
+  UnOpInit(UnaryOp opc, const Init *lhs, const RecTy *Type)
       : OpInit(IK_UnOpInit, Type, opc), LHS(lhs) {}

 public:
@@ -872,12 +880,12 @@ public:
     return I->getKind() == IK_UnOpInit;
   }

-  static UnOpInit *get(UnaryOp opc, Init *lhs, const RecTy *Type);
+  static const UnOpInit *get(UnaryOp opc, const Init *lhs, const RecTy *Type);

   void Profile(FoldingSetNodeID &ID) const;

   // Clone - Clone this operator, replacing arguments with the new list
-  OpInit *clone(ArrayRef<Init *> Operands) const override {
+  const OpInit *clone(ArrayRef<const Init *> Operands) const override {
     assert(Operands.size() == 1 &&
            "Wrong number of operands for unary operation");
     return UnOpInit::get(getOpcode(), *Operands.begin(), getType());
@@ -885,19 +893,19 @@ public:

   unsigned getNumOperands() const override { return 1; }

-  Init *getOperand(unsigned i) const override {
+  const Init *getOperand(unsigned i) const override {
     assert(i == 0 && "Invalid operand id for unary operator");
     return getOperand();
   }

   UnaryOp getOpcode() const { return (UnaryOp)Opc; }
-  Init *getOperand() const { return LHS; }
+  const Init *getOperand() const { return LHS; }

   // Fold - If possible, fold this to a simpler init. Return this if not
   // possible to fold.
-  Init *Fold(Record *CurRec, bool IsFinal = false) const;
+  const Init *Fold(const Record *CurRec, bool IsFinal = false) const;

-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

   std::string getAsString() const override;
 };
@@ -937,9 +945,9 @@ public:
 };

 private:
-  Init *LHS, *RHS;
-  BinOpInit(BinaryOp opc, Init *lhs, Init *rhs, const RecTy *Type)
+  const Init *LHS, *RHS;
+  BinOpInit(BinaryOp opc, const Init *lhs, const Init *rhs, const RecTy *Type)
       : OpInit(IK_BinOpInit, Type, opc), LHS(lhs), RHS(rhs) {}

 public:
   BinOpInit(const BinOpInit &) = delete;
   BinOpInit &operator=(const BinOpInit &) = delete;

   static bool classof(const Init *I) {
     return I->getKind() == IK_BinOpInit;
   }

-  static BinOpInit *get(BinaryOp opc, Init *lhs, Init *rhs, const RecTy *Type);
-  static Init *getStrConcat(Init *lhs, Init *rhs);
-  static Init *getListConcat(TypedInit *lhs, Init *rhs);
+  static const BinOpInit *get(BinaryOp opc, const Init *lhs, const Init *rhs,
+                              const RecTy *Type);
+  static const Init *getStrConcat(const Init *lhs, const Init *rhs);
+  static const Init *getListConcat(const TypedInit *lhs, const Init *rhs);

   void Profile(FoldingSetNodeID &ID) const;

   // Clone - Clone this operator, replacing arguments with the new list
-  OpInit *clone(ArrayRef<Init *> Operands) const override {
+  const OpInit *clone(ArrayRef<const Init *> Operands) const override {
     assert(Operands.size() == 2 &&
            "Wrong number of operands for binary operation");
     return BinOpInit::get(getOpcode(), Operands[0], Operands[1], getType());
   }

   unsigned getNumOperands() const override { return 2; }
-  Init *getOperand(unsigned i) const override {
+  const Init *getOperand(unsigned i) const override {
     switch (i) {
     default: llvm_unreachable("Invalid operand id for binary operator");
     case 0: return getLHS();
@@ -973,16 +982,17 @@ public:
   }

   BinaryOp getOpcode() const { return (BinaryOp)Opc; }
-  Init *getLHS() const { return LHS; }
-  Init *getRHS() const { return RHS; }
+  const Init *getLHS() const { return LHS; }
+  const Init *getRHS() const { return RHS; }

-  std::optional<bool> CompareInit(unsigned Opc, Init *LHS, Init *RHS) const;
+  std::optional<bool> CompareInit(unsigned Opc, const Init *LHS,
+                                  const Init *RHS) const;

   // Fold - If possible, fold this to a simpler init. Return this if not
   // possible to fold.
-  Init *Fold(Record *CurRec) const;
+  const Init *Fold(const Record *CurRec) const;

-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

   std::string getAsString() const override;
 };
@@ -1004,9 +1014,10 @@ public:
 };

 private:
-  Init *LHS, *MHS, *RHS;
-  TernOpInit(TernaryOp opc, Init *lhs, Init *mhs, Init *rhs, const RecTy *Type)
+  const Init *LHS, *MHS, *RHS;
+  TernOpInit(TernaryOp opc, const Init *lhs, const Init *mhs, const Init *rhs,
+             const RecTy *Type)
       : OpInit(IK_TernOpInit, Type, opc), LHS(lhs), MHS(mhs), RHS(rhs) {}

 public:
   TernOpInit(const TernOpInit &) = delete;
   TernOpInit &operator=(const TernOpInit &) = delete;

   static bool classof(const Init *I) {
     return I->getKind() == IK_TernOpInit;
   }

-  static TernOpInit *get(TernaryOp opc, Init *lhs, Init *mhs, Init *rhs,
-                         const RecTy *Type);
+  static const TernOpInit *get(TernaryOp opc, const Init *lhs, const Init *mhs,
+                               const Init *rhs, const RecTy *Type);

   void Profile(FoldingSetNodeID &ID) const;

   // Clone - Clone this operator, replacing arguments with the new list
-  OpInit *clone(ArrayRef<Init *> Operands) const override {
+  const OpInit *clone(ArrayRef<const Init *> Operands) const override {
     assert(Operands.size() == 3 &&
            "Wrong number of operands for ternary operation");
     return TernOpInit::get(getOpcode(), Operands[0], Operands[1], Operands[2],
@@ -1031,7 +1042,7 @@ public:
   }

   unsigned getNumOperands() const override { return 3; }
-  Init *getOperand(unsigned i) const override {
+  const Init *getOperand(unsigned i) const override {
     switch (i) {
     default: llvm_unreachable("Invalid operand id for ternary operator");
     case 0: return getLHS();
@@ -1041,19 +1052,19 @@ public:
   }

   TernaryOp getOpcode() const { return (TernaryOp)Opc; }
-  Init *getLHS() const { return LHS; }
-  Init *getMHS() const { return MHS; }
-  Init *getRHS() const { return RHS; }
+  const Init *getLHS() const { return LHS; }
+  const Init *getMHS() const { return MHS; }
+  const Init *getRHS() const { return RHS; }

   // Fold - If possible, fold this to a simpler init. Return this if not
   // possible to fold.
-  Init *Fold(Record *CurRec) const;
+  const Init *Fold(const Record *CurRec) const;

   bool isComplete() const override {
     return LHS->isComplete() && MHS->isComplete() && RHS->isComplete();
   }

-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

   std::string getAsString() const override;
 };
@@ -1061,8 +1072,9 @@ public:

 /// !cond(condition_1: value1, ... , condition_n: value)
 /// Selects the first value for which condition is true.
 /// Otherwise reports an error.
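// ---- Editor's note: illustrative sketch, not part of this patch. ----
// With the signatures above, a caller builds and folds a ternary node as
// follows; TernOpInit::IF (the `!if` opcode, defined in the enum elided
// from this hunk) and `makeIf` are assumptions of this sketch:
//
//   static const Init *makeIf(const Init *Cond, const Init *A, const Init *B,
//                             const RecTy *Ty, const Record *CurRec) {
//     return TernOpInit::get(TernOpInit::IF, Cond, A, B, Ty)->Fold(CurRec);
//   }
// ----------------------------------------------------------------------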
-class CondOpInit final : public TypedInit, public FoldingSetNode,
-                         public TrailingObjects<CondOpInit, Init *> {
+class CondOpInit final : public TypedInit,
+                         public FoldingSetNode,
+                         public TrailingObjects<CondOpInit, const Init *> {
   unsigned NumConds;
   const RecTy *ValType;

@@ -1081,8 +1093,8 @@ public:
     return I->getKind() == IK_CondOpInit;
   }

-  static CondOpInit *get(ArrayRef<Init *> C, ArrayRef<Init *> V,
-                         const RecTy *Type);
+  static const CondOpInit *get(ArrayRef<const Init *> C,
+                               ArrayRef<const Init *> V, const RecTy *Type);

   void Profile(FoldingSetNodeID &ID) const;

@@ -1090,34 +1102,34 @@ public:

   unsigned getNumConds() const { return NumConds; }

-  Init *getCond(unsigned Num) const {
+  const Init *getCond(unsigned Num) const {
     assert(Num < NumConds && "Condition number out of range!");
-    return getTrailingObjects<Init *>()[Num];
+    return getTrailingObjects<const Init *>()[Num];
   }

-  Init *getVal(unsigned Num) const {
+  const Init *getVal(unsigned Num) const {
     assert(Num < NumConds && "Val number out of range!");
-    return getTrailingObjects<Init *>()[Num+NumConds];
+    return getTrailingObjects<const Init *>()[Num + NumConds];
   }

-  ArrayRef<Init *> getConds() const {
-    return ArrayRef(getTrailingObjects<Init *>(), NumConds);
+  ArrayRef<const Init *> getConds() const {
+    return ArrayRef(getTrailingObjects<const Init *>(), NumConds);
   }

-  ArrayRef<Init *> getVals() const {
-    return ArrayRef(getTrailingObjects<Init *>() + NumConds, NumConds);
+  ArrayRef<const Init *> getVals() const {
+    return ArrayRef(getTrailingObjects<const Init *>() + NumConds, NumConds);
   }

-  Init *Fold(Record *CurRec) const;
+  const Init *Fold(const Record *CurRec) const;

-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

   bool isConcrete() const override;
   bool isComplete() const override;
   std::string getAsString() const override;

-  using const_case_iterator = SmallVectorImpl<Init *>::const_iterator;
-  using const_val_iterator = SmallVectorImpl<Init *>::const_iterator;
+  using const_case_iterator = SmallVectorImpl<const Init *>::const_iterator;
+  using const_val_iterator = SmallVectorImpl<const Init *>::const_iterator;

   inline const_case_iterator arg_begin() const { return getConds().begin(); }
   inline const_case_iterator arg_end  () const { return getConds().end(); }
@@ -1131,20 +1143,16 @@ public:
   inline size_t val_size () const { return NumConds; }
   inline bool   val_empty() const { return NumConds == 0; }

-  Init *getBit(unsigned Bit) const override;
+  const Init *getBit(unsigned Bit) const override;
 };

 /// !foldl (a, b, expr, start, lst) - Fold over a list.
 class FoldOpInit : public TypedInit, public FoldingSetNode {
 private:
-  Init *Start;
-  Init *List;
-  Init *A;
-  Init *B;
-  Init *Expr;
+  const Init *Start, *List, *A, *B, *Expr;

-  FoldOpInit(Init *Start, Init *List, Init *A, Init *B, Init *Expr,
-             const RecTy *Type)
+  FoldOpInit(const Init *Start, const Init *List, const Init *A, const Init *B,
+             const Init *Expr, const RecTy *Type)
       : TypedInit(IK_FoldOpInit, Type), Start(Start), List(List), A(A), B(B),
         Expr(Expr) {}

@@ -1154,20 +1162,21 @@ public:

   static bool classof(const Init *I) { return I->getKind() == IK_FoldOpInit; }

-  static FoldOpInit *get(Init *Start, Init *List, Init *A, Init *B, Init *Expr,
-                         const RecTy *Type);
+  static const FoldOpInit *get(const Init *Start, const Init *List,
+                               const Init *A, const Init *B, const Init *Expr,
+                               const RecTy *Type);

   void Profile(FoldingSetNodeID &ID) const;

   // Fold - If possible, fold this to a simpler init. Return this if not
   // possible to fold.
-  Init *Fold(Record *CurRec) const;
+  const Init *Fold(const Record *CurRec) const;

   bool isComplete() const override { return false; }

-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

-  Init *getBit(unsigned Bit) const override;
+  const Init *getBit(unsigned Bit) const override;

   std::string getAsString() const override;
 };
@@ -1176,9 +1185,9 @@ public:
 class IsAOpInit : public TypedInit, public FoldingSetNode {
 private:
   const RecTy *CheckType;
-  Init *Expr;
+  const Init *Expr;

-  IsAOpInit(const RecTy *CheckType, Init *Expr)
+  IsAOpInit(const RecTy *CheckType, const Init *Expr)
       : TypedInit(IK_IsAOpInit, IntRecTy::get(CheckType->getRecordKeeper())),
         CheckType(CheckType), Expr(Expr) {}

 public:
   IsAOpInit(const IsAOpInit &) = delete;
   IsAOpInit &operator=(const IsAOpInit &) = delete;

   static bool classof(const Init *I) { return I->getKind() == IK_IsAOpInit; }

-  static IsAOpInit *get(const RecTy *CheckType, Init *Expr);
+  static const IsAOpInit *get(const RecTy *CheckType, const Init *Expr);

   void Profile(FoldingSetNodeID &ID) const;

   // Fold - If possible, fold this to a simpler init. Return this if not
   // possible to fold.
-  Init *Fold() const;
+  const Init *Fold() const;

   bool isComplete() const override { return false; }

-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

-  Init *getBit(unsigned Bit) const override;
+  const Init *getBit(unsigned Bit) const override;

   std::string getAsString() const override;
 };
@@ -1210,9 +1219,9 @@ public:
 class ExistsOpInit : public TypedInit, public FoldingSetNode {
 private:
   const RecTy *CheckType;
-  Init *Expr;
+  const Init *Expr;

-  ExistsOpInit(const RecTy *CheckType, Init *Expr)
+  ExistsOpInit(const RecTy *CheckType, const Init *Expr)
       : TypedInit(IK_ExistsOpInit, IntRecTy::get(CheckType->getRecordKeeper())),
         CheckType(CheckType), Expr(Expr) {}

 public:
   ExistsOpInit(const ExistsOpInit &) = delete;
   ExistsOpInit &operator=(const ExistsOpInit &) = delete;

   static bool classof(const Init *I) { return I->getKind() == IK_ExistsOpInit; }

-  static ExistsOpInit *get(const RecTy *CheckType, Init *Expr);
+  static const ExistsOpInit *get(const RecTy *CheckType, const Init *Expr);

   void Profile(FoldingSetNodeID &ID) const;

   // Fold - If possible, fold this to a simpler init. Return this if not
   // possible to fold.
-  Init *Fold(Record *CurRec, bool IsFinal = false) const;
+  const Init *Fold(const Record *CurRec, bool IsFinal = false) const;

   bool isComplete() const override { return false; }

-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

-  Init *getBit(unsigned Bit) const override;
+  const Init *getBit(unsigned Bit) const override;

   std::string getAsString() const override;
 };

 /// 'Opcode' - Represent a reference to an entire variable object.
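// ---- Editor's note: illustrative sketch, not part of this patch. ----
// IsAOpInit and ExistsOpInit fold to a bit-valued IntInit once their operand
// is concrete; until then Fold() returns the node unchanged (note that
// isComplete() is always false). A hypothetical caller:
//
//   static const Init *checkIsA(const RecTy *CheckType, const Init *Expr) {
//     return IsAOpInit::get(CheckType, Expr)->Fold(); // 1, 0, or unresolved
//   }
// ----------------------------------------------------------------------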
 class VarInit : public TypedInit {
-  Init *VarName;
-  explicit VarInit(Init *VN, const RecTy *T)
+  const Init *VarName;
+  explicit VarInit(const Init *VN, const RecTy *T)
       : TypedInit(IK_VarInit, T), VarName(VN) {}

 public:
@@ -1254,11 +1263,11 @@ public:
     return I->getKind() == IK_VarInit;
   }

-  static VarInit *get(StringRef VN, const RecTy *T);
-  static VarInit *get(Init *VN, const RecTy *T);
+  static const VarInit *get(StringRef VN, const RecTy *T);
+  static const VarInit *get(const Init *VN, const RecTy *T);

   StringRef getName() const;
-  Init *getNameInit() const { return VarName; }
+  const Init *getNameInit() const { return VarName; }

   std::string getNameInitAsString() const {
     return getNameInit()->getAsUnquotedString();
@@ -1269,19 +1278,19 @@ public:
   /// If a value is set for the variable later, this method will be called on
   /// users of the value to allow the value to propagate out.
   ///
-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

-  Init *getBit(unsigned Bit) const override;
+  const Init *getBit(unsigned Bit) const override;

   std::string getAsString() const override { return std::string(getName()); }
 };

 /// Opcode{0} - Represent access to one bit of a variable or field.
 class VarBitInit final : public TypedInit {
-  TypedInit *TI;
+  const TypedInit *TI;
   unsigned Bit;

-  VarBitInit(TypedInit *T, unsigned B)
+  VarBitInit(const TypedInit *T, unsigned B)
       : TypedInit(IK_VarBitInit, BitRecTy::get(T->getRecordKeeper())), TI(T),
         Bit(B) {
     assert(T->getType() &&
@@ -1299,15 +1308,15 @@ public:
     return I->getKind() == IK_VarBitInit;
   }

-  static VarBitInit *get(TypedInit *T, unsigned B);
+  static const VarBitInit *get(const TypedInit *T, unsigned B);

-  Init *getBitVar() const { return TI; }
+  const Init *getBitVar() const { return TI; }
   unsigned getBitNum() const { return Bit; }

   std::string getAsString() const override;
-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

-  Init *getBit(unsigned B) const override {
+  const Init *getBit(unsigned B) const override {
     assert(B < 1 && "Bit index out of range!");
     return const_cast<VarBitInit *>(this);
   }
@@ -1329,33 +1338,34 @@ public:
     return I->getKind() == IK_DefInit;
   }

-  Init *convertInitializerTo(const RecTy *Ty) const override;
+  const Init *convertInitializerTo(const RecTy *Ty) const override;

   const Record *getDef() const { return Def; }

-  const RecTy *getFieldType(StringInit *FieldName) const override;
+  const RecTy *getFieldType(const StringInit *FieldName) const override;

   bool isConcrete() const override { return true; }
   std::string getAsString() const override;

-  Init *getBit(unsigned Bit) const override {
+  const Init *getBit(unsigned Bit) const override {
     llvm_unreachable("Illegal bit reference off def");
   }
 };

 /// classname<targs...> - Represent an uninstantiated anonymous class
 /// instantiation.
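// ---- Editor's note: illustrative sketch, not part of this patch. ----
// getBit() on a typed value yields either a concrete BitInit or a VarBitInit
// placeholder; this hypothetical helper gathers a value's bits the way a
// BitsInit stores them:
//
//   static SmallVector<const Init *, 16> explodeBits(const TypedInit *TI,
//                                                    unsigned NumBits) {
//     SmallVector<const Init *, 16> Bits;
//     for (unsigned I = 0; I != NumBits; ++I)
//       Bits.push_back(TI->getBit(I));
//     return Bits;
//   }
// ----------------------------------------------------------------------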
-class VarDefInit final : public TypedInit,
-                         public FoldingSetNode,
-                         public TrailingObjects<VarDefInit, ArgumentInit *> {
+class VarDefInit final
+    : public TypedInit,
+      public FoldingSetNode,
+      public TrailingObjects<VarDefInit, const ArgumentInit *> {
   SMLoc Loc;
   Record *Class;
-  DefInit *Def = nullptr; // after instantiation
+  const DefInit *Def = nullptr; // after instantiation
   unsigned NumArgs;

   explicit VarDefInit(SMLoc Loc, Record *Class, unsigned N);

-  DefInit *instantiate();
+  const DefInit *instantiate();

 public:
   VarDefInit(const VarDefInit &) = delete;
@@ -1367,46 +1377,46 @@ public:
   static bool classof(const Init *I) {
     return I->getKind() == IK_VarDefInit;
   }
-  static VarDefInit *get(SMLoc Loc, Record *Class,
-                         ArrayRef<ArgumentInit *> Args);
+  static const VarDefInit *get(SMLoc Loc, Record *Class,
+                               ArrayRef<const ArgumentInit *> Args);

   void Profile(FoldingSetNodeID &ID) const;

-  Init *resolveReferences(Resolver &R) const override;
-  Init *Fold() const;
+  const Init *resolveReferences(Resolver &R) const override;
+  const Init *Fold() const;

   std::string getAsString() const override;

-  ArgumentInit *getArg(unsigned i) const {
+  const ArgumentInit *getArg(unsigned i) const {
     assert(i < NumArgs && "Argument index out of range!");
-    return getTrailingObjects<ArgumentInit *>()[i];
+    return getTrailingObjects<const ArgumentInit *>()[i];
   }

-  using const_iterator = ArgumentInit *const *;
+  using const_iterator = const ArgumentInit *const *;

   const_iterator args_begin() const {
-    return getTrailingObjects<ArgumentInit *>();
+    return getTrailingObjects<const ArgumentInit *>();
   }
   const_iterator args_end  () const { return args_begin() + NumArgs; }

   size_t         args_size () const { return NumArgs; }
   bool           args_empty() const { return NumArgs == 0; }

-  ArrayRef<ArgumentInit *> args() const {
+  ArrayRef<const ArgumentInit *> args() const {
     return ArrayRef(args_begin(), NumArgs);
   }

-  Init *getBit(unsigned Bit) const override {
+  const Init *getBit(unsigned Bit) const override {
     llvm_unreachable("Illegal bit reference off anonymous def");
   }
 };

 /// X.Y - Represent a reference to a subfield of a variable
 class FieldInit : public TypedInit {
-  Init *Rec;             // Record we are referring to
-  StringInit *FieldName; // Field we are accessing
+  const Init *Rec;             // Record we are referring to
+  const StringInit *FieldName; // Field we are accessing

-  FieldInit(Init *R, StringInit *FN)
+  FieldInit(const Init *R, const StringInit *FN)
      : TypedInit(IK_FieldInit, R->getFieldType(FN)), Rec(R), FieldName(FN) {
 #ifndef NDEBUG
     if (!getType()) {
@@ -1426,15 +1436,15 @@ public:
     return I->getKind() == IK_FieldInit;
   }

-  static FieldInit *get(Init *R, StringInit *FN);
+  static const FieldInit *get(const Init *R, const StringInit *FN);

-  Init *getRecord() const { return Rec; }
-  StringInit *getFieldName() const { return FieldName; }
+  const Init *getRecord() const { return Rec; }
+  const StringInit *getFieldName() const { return FieldName; }

-  Init *getBit(unsigned Bit) const override;
+  const Init *getBit(unsigned Bit) const override;

-  Init *resolveReferences(Resolver &R) const override;
-  Init *Fold(Record *CurRec) const;
+  const Init *resolveReferences(Resolver &R) const override;
+  const Init *Fold(const Record *CurRec) const;

   bool isConcrete() const override;
   std::string getAsString() const override {
@@ -1445,20 +1455,25 @@ public:

 /// (v a, b) - Represent a DAG tree value. DAG inits are required
 /// to have at least one value then a (possibly empty) list of arguments. Each
 /// argument can have a name associated with it.
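// ---- Editor's note: illustrative sketch, not part of this patch. ----
// An X.Y access is modeled as a FieldInit that folds once the record
// reference becomes concrete; `readField` is a hypothetical helper over the
// API above (FieldInit::get requires the field's type to be resolvable):
//
//   static const Init *readField(const Init *R, const StringInit *Name,
//                                const Record *CurRec) {
//     return FieldInit::get(R, Name)->Fold(CurRec);
//   }
// ----------------------------------------------------------------------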
-class DagInit final : public TypedInit, public FoldingSetNode,
-                      public TrailingObjects<DagInit, Init *, StringInit *> {
+class DagInit final
+    : public TypedInit,
+      public FoldingSetNode,
+      public TrailingObjects<DagInit, const Init *, const StringInit *> {
   friend TrailingObjects;

-  Init *Val;
-  StringInit *ValName;
+  const Init *Val;
+  const StringInit *ValName;
   unsigned NumArgs;
   unsigned NumArgNames;

-  DagInit(Init *V, StringInit *VN, unsigned NumArgs, unsigned NumArgNames)
+  DagInit(const Init *V, const StringInit *VN, unsigned NumArgs,
+          unsigned NumArgNames)
       : TypedInit(IK_DagInit, DagRecTy::get(V->getRecordKeeper())), Val(V),
         ValName(VN), NumArgs(NumArgs), NumArgNames(NumArgNames) {}

-  size_t numTrailingObjects(OverloadToken<Init *>) const { return NumArgs; }
+  size_t numTrailingObjects(OverloadToken<const Init *>) const {
+    return NumArgs;
+  }

 public:
   DagInit(const DagInit &) = delete;
   DagInit &operator=(const DagInit &) = delete;

   static bool classof(const Init *I) {
     return I->getKind() == IK_DagInit;
   }

-  static DagInit *get(Init *V, StringInit *VN, ArrayRef<Init *> ArgRange,
-                      ArrayRef<StringInit *> NameRange);
-  static DagInit *get(Init *V, StringInit *VN,
-                      ArrayRef<std::pair<Init *, StringInit *>> Args);
+  static const DagInit *get(const Init *V, const StringInit *VN,
+                            ArrayRef<const Init *> ArgRange,
+                            ArrayRef<const StringInit *> NameRange);
+  static const DagInit *
+  get(const Init *V, const StringInit *VN,
+      ArrayRef<std::pair<const Init *, const StringInit *>> Args);

   void Profile(FoldingSetNodeID &ID) const;

-  Init *getOperator() const { return Val; }
+  const Init *getOperator() const { return Val; }
   const Record *getOperatorAsDef(ArrayRef<SMLoc> Loc) const;

-  StringInit *getName() const { return ValName; }
+  const StringInit *getName() const { return ValName; }

   StringRef getNameStr() const {
     return ValName ? ValName->getValue() : StringRef();
   }

   unsigned getNumArgs() const { return NumArgs; }

-  Init *getArg(unsigned Num) const {
+  const Init *getArg(unsigned Num) const {
     assert(Num < NumArgs && "Arg number out of range!");
-    return getTrailingObjects<Init *>()[Num];
+    return getTrailingObjects<const Init *>()[Num];
   }

   /// This method looks up the specified argument name and returns its argument
   /// number or std::nullopt if that argument name does not exist.
   std::optional<unsigned> getArgNo(StringRef Name) const;

-  StringInit *getArgName(unsigned Num) const {
+  const StringInit *getArgName(unsigned Num) const {
     assert(Num < NumArgNames && "Arg number out of range!");
-    return getTrailingObjects<StringInit *>()[Num];
+    return getTrailingObjects<const StringInit *>()[Num];
   }

   StringRef getArgNameStr(unsigned Num) const {
-    StringInit *Init = getArgName(Num);
+    const StringInit *Init = getArgName(Num);
     return Init ? Init->getValue() : StringRef();
   }

-  ArrayRef<Init *> getArgs() const {
-    return ArrayRef(getTrailingObjects<Init *>(), NumArgs);
+  ArrayRef<const Init *> getArgs() const {
+    return ArrayRef(getTrailingObjects<const Init *>(), NumArgs);
   }

-  ArrayRef<StringInit *> getArgNames() const {
-    return ArrayRef(getTrailingObjects<StringInit *>(), NumArgNames);
+  ArrayRef<const StringInit *> getArgNames() const {
+    return ArrayRef(getTrailingObjects<const StringInit *>(), NumArgNames);
   }

-  Init *resolveReferences(Resolver &R) const override;
+  const Init *resolveReferences(Resolver &R) const override;

   bool isConcrete() const override;
   std::string getAsString() const override;

-  using const_arg_iterator = SmallVectorImpl<Init *>::const_iterator;
-  using const_name_iterator = SmallVectorImpl<StringInit *>::const_iterator;
+  using const_arg_iterator = SmallVectorImpl<const Init *>::const_iterator;
+  using const_name_iterator =
+      SmallVectorImpl<const StringInit *>::const_iterator;

   inline const_arg_iterator arg_begin() const { return getArgs().begin(); }
   inline const_arg_iterator arg_end  () const { return getArgs().end(); }
@@ -1533,7 +1551,7 @@ public:
   inline size_t name_size () const { return NumArgNames; }
   inline bool   name_empty() const { return NumArgNames == 0; }

-  Init *getBit(unsigned Bit) const override {
+  const Init *getBit(unsigned Bit) const override {
     llvm_unreachable("Illegal bit reference off dag");
   }
 };
@@ -1555,18 +1573,18 @@ public:
   };

 private:
-  Init *Name;
+  const Init *Name;
   SMLoc Loc; // Source location of definition of name.
   PointerIntPair<const RecTy *, 2, FieldKind> TyAndKind;
-  Init *Value;
+  const Init *Value;
   bool IsUsed = false;

   /// Reference locations to this record value.
   SmallVector<SMRange> ReferenceLocs;

 public:
-  RecordVal(Init *N, const RecTy *T, FieldKind K);
-  RecordVal(Init *N, SMLoc Loc, const RecTy *T, FieldKind K);
+  RecordVal(const Init *N, const RecTy *T, FieldKind K);
+  RecordVal(const Init *N, SMLoc Loc, const RecTy *T, FieldKind K);

   /// Get the record keeper used to unique this value.
   RecordKeeper &getRecordKeeper() const { return Name->getRecordKeeper(); }
@@ -1575,7 +1593,7 @@ public:
   StringRef getName() const;

   /// Get the name of the field as an Init.
-  Init *getNameInit() const { return Name; }
+  const Init *getNameInit() const { return Name; }

   /// Get the name of the field as a std::string.
   std::string getNameInitAsString() const {
@@ -1602,13 +1620,13 @@ public:
   std::string getPrintType() const;

   /// Get the value of the field as an Init.
-  Init *getValue() const { return Value; }
+  const Init *getValue() const { return Value; }

   /// Set the value of the field from an Init.
-  bool setValue(Init *V);
+  bool setValue(const Init *V);

   /// Set the value and source location of the field.
-  bool setValue(Init *V, SMLoc NewLoc);
+  bool setValue(const Init *V, SMLoc NewLoc);

   /// Add a reference to this record value.
   void addReferenceLoc(SMRange Loc) { ReferenceLocs.push_back(Loc); }
@@ -1636,35 +1654,35 @@ class Record {
 public:
   struct AssertionInfo {
     SMLoc Loc;
-    Init *Condition;
-    Init *Message;
+    const Init *Condition;
+    const Init *Message;

     // User-defined constructor to support std::make_unique(). It can be
     // removed in C++20 when braced initialization is supported.
-    AssertionInfo(SMLoc Loc, Init *Condition, Init *Message)
+    AssertionInfo(SMLoc Loc, const Init *Condition, const Init *Message)
         : Loc(Loc), Condition(Condition), Message(Message) {}
   };

   struct DumpInfo {
     SMLoc Loc;
-    Init *Message;
+    const Init *Message;

     // User-defined constructor to support std::make_unique(). It can be
     // removed in C++20 when braced initialization is supported.
-    DumpInfo(SMLoc Loc, Init *Message) : Loc(Loc), Message(Message) {}
+    DumpInfo(SMLoc Loc, const Init *Message) : Loc(Loc), Message(Message) {}
   };

   enum RecordKind { RK_Def, RK_AnonymousDef, RK_Class, RK_MultiClass };

 private:
-  Init *Name;
+  const Init *Name;
   // Location where record was instantiated, followed by the location of
   // multiclass prototypes used, and finally by the locations of references to
   // this record.
   SmallVector<SMLoc, 4> Locs;
   SmallVector<SMLoc, 0> ForwardDeclarationLocs;
   mutable SmallVector<SMRange, 0> ReferenceLocs;
-  SmallVector<Init *, 0> TemplateArgs;
+  SmallVector<const Init *, 0> TemplateArgs;
   SmallVector<RecordVal, 0> Values;
   SmallVector<AssertionInfo, 0> Assertions;
   SmallVector<DumpInfo, 0> Dumps;
@@ -1688,7 +1706,7 @@ private:

 public:
   // Constructs a record.
-  explicit Record(Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records,
+  explicit Record(const Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records,
                   RecordKind Kind = RK_Def)
       : Name(N), Locs(locs), TrackedRecords(records),
         ID(getNewUID(N->getRecordKeeper())), Kind(Kind) {
@@ -1714,15 +1732,13 @@ public:
   StringRef getName() const { return cast<StringInit>(Name)->getValue(); }

-  Init *getNameInit() const {
-    return Name;
-  }
+  const Init *getNameInit() const { return Name; }

   std::string getNameInitAsString() const {
     return getNameInit()->getAsUnquotedString();
   }

-  void setName(Init *Name); // Also updates RecordKeeper.
+  void setName(const Init *Name); // Also updates RecordKeeper.

   ArrayRef<SMLoc> getLoc() const { return Locs; }
   void appendLoc(SMLoc Loc) { Locs.push_back(Loc); }
@@ -1752,9 +1768,7 @@ public:

   bool isAnonymous() const { return Kind == RK_AnonymousDef; }

-  ArrayRef<Init *> getTemplateArgs() const {
-    return TemplateArgs;
-  }
+  ArrayRef<const Init *> getTemplateArgs() const { return TemplateArgs; }

   ArrayRef<RecordVal> getValues() const { return Values; }

@@ -1771,7 +1785,7 @@ public:
   /// Append the direct superclasses of this record to Classes.
   void getDirectSuperClasses(SmallVectorImpl<const Record *> &Classes) const;

-  bool isTemplateArg(Init *Name) const {
+  bool isTemplateArg(const Init *Name) const {
     return llvm::is_contained(TemplateArgs, Name);
   }

@@ -1795,7 +1809,7 @@ public:
         static_cast<const Record *>(this)->getValue(Name));
   }

-  void addTemplateArg(Init *Name) {
+  void addTemplateArg(const Init *Name) {
     assert(!isTemplateArg(Name) && "Template arg already defined!");
     TemplateArgs.push_back(Name);
   }
@@ -1805,7 +1819,7 @@ public:
     Values.push_back(RV);
   }

-  void removeValue(Init *Name) {
+  void removeValue(const Init *Name) {
     for (unsigned i = 0, e = Values.size(); i != e; ++i)
       if (Values[i].getNameInit() == Name) {
         Values.erase(Values.begin()+i);
@@ -1818,11 +1832,11 @@ public:
     removeValue(StringInit::get(getRecords(), Name));
   }

-  void addAssertion(SMLoc Loc, Init *Condition, Init *Message) {
+  void addAssertion(SMLoc Loc, const Init *Condition, const Init *Message) {
     Assertions.push_back(AssertionInfo(Loc, Condition, Message));
   }

-  void addDump(SMLoc Loc, Init *Message) {
+  void addDump(SMLoc Loc, const Init *Message) {
     Dumps.push_back(DumpInfo(Loc, Message));
   }

@@ -1867,7 +1881,7 @@ public:
   ///
   /// This is a final resolve: any error messages, e.g. due to undefined !cast
   /// references, are generated now.
-  void resolveReferences(Init *NewName = nullptr);
+  void resolveReferences(const Init *NewName = nullptr);

   /// Apply the resolver to the name of the record as well as to the
   /// initializers of all fields of the record except SkipVal.
@@ -1891,7 +1905,7 @@ public:

   /// Return the initializer for a value with the specified name, or throw an
   /// exception if the field does not exist.
-  Init *getValueInit(StringRef FieldName) const;
+  const Init *getValueInit(StringRef FieldName) const;

   /// Return true if the named field is unset.
   bool isValueUnset(StringRef FieldName) const {
@@ -1911,12 +1925,12 @@ public:
   /// This method looks up the specified field and returns its value as a
   /// BitsInit, throwing an exception if the field does not exist or if the
   /// value is not the right type.
-  BitsInit *getValueAsBitsInit(StringRef FieldName) const;
+  const BitsInit *getValueAsBitsInit(StringRef FieldName) const;

   /// This method looks up the specified field and returns its value as a
   /// ListInit, throwing an exception if the field does not exist or if the
   /// value is not the right type.
-  ListInit *getValueAsListInit(StringRef FieldName) const;
+  const ListInit *getValueAsListInit(StringRef FieldName) const;

   /// This method looks up the specified field and returns its value as a
   /// vector of records, throwing an exception if the field does not exist or
@@ -1961,14 +1975,14 @@ public:
   /// This method looks up the specified field and returns its value as an Dag,
   /// throwing an exception if the field does not exist or if the value is not
   /// the right type.
-  DagInit *getValueAsDag(StringRef FieldName) const;
+  const DagInit *getValueAsDag(StringRef FieldName) const;
 };

 raw_ostream &operator<<(raw_ostream &OS, const Record &R);

 class RecordKeeper {
   using RecordMap = std::map<std::string, std::unique_ptr<Record>, std::less<>>;
-  using GlobalMap = std::map<std::string, Init *, std::less<>>;
+  using GlobalMap = std::map<std::string, const Init *, std::less<>>;

 public:
   RecordKeeper();
@@ -2002,7 +2016,7 @@ public:
   }

   /// Get the \p Init value of the specified global variable.
-  Init *getGlobal(StringRef Name) const {
+  const Init *getGlobal(StringRef Name) const {
     if (const Record *R = getDef(Name))
       return R->getDefInit();
     auto It = ExtraGlobals.find(Name);
@@ -2027,14 +2041,14 @@ public:
     assert(Ins && "Record already exists");
   }

-  void addExtraGlobal(StringRef Name, Init *I) {
+  void addExtraGlobal(StringRef Name, const Init *I) {
     bool Ins =
         ExtraGlobals.insert(std::make_pair(std::string(Name), I)).second;
     (void)Ins;
     assert(!getDef(Name));
     assert(Ins && "Global already exists");
   }

-  Init *getNewAnonymousName();
+  const Init *getNewAnonymousName();

   TGTimer &getTimer() const { return *Timer; }

@@ -2190,18 +2204,18 @@ raw_ostream &operator<<(raw_ostream &OS, const RecordKeeper &RK);

 /// Interface for looking up the initializer for a variable name, used by
 /// Init::resolveReferences.
 class Resolver {
-  Record *CurRec;
+  const Record *CurRec;
   bool IsFinal = false;

 public:
-  explicit Resolver(Record *CurRec) : CurRec(CurRec) {}
+  explicit Resolver(const Record *CurRec) : CurRec(CurRec) {}
   virtual ~Resolver() = default;

-  Record *getCurrentRecord() const { return CurRec; }
+  const Record *getCurrentRecord() const { return CurRec; }

   /// Return the initializer for the given variable name (should normally be a
   /// StringInit), or nullptr if the name could not be resolved.
-  virtual Init *resolve(Init *VarName) = 0;
+  virtual const Init *resolve(const Init *VarName) = 0;

   // Whether bits in a BitsInit should stay unresolved if resolving them would
   // result in a ? (UnsetInit). This behavior is used to represent instruction
@@ -2219,19 +2233,19 @@ public:

 /// Resolve arbitrary mappings.
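// ---- Editor's note: illustrative sketch, not part of this patch. ----
// Under the const-correct interface a Resolver subclass only has to
// override resolve(); this hypothetical no-op resolver shows the minimal
// shape shared by the concrete resolvers below:
//
//   class NullResolver final : public Resolver {
//   public:
//     NullResolver() : Resolver(nullptr) {}
//     const Init *resolve(const Init *VarName) override { return nullptr; }
//   };
// ----------------------------------------------------------------------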
 class MapResolver final : public Resolver {
   struct MappedValue {
-    Init *V;
+    const Init *V;
     bool Resolved;

     MappedValue() : V(nullptr), Resolved(false) {}
-    MappedValue(Init *V, bool Resolved) : V(V), Resolved(Resolved) {}
+    MappedValue(const Init *V, bool Resolved) : V(V), Resolved(Resolved) {}
   };

-  DenseMap<Init *, MappedValue> Map;
+  DenseMap<const Init *, MappedValue> Map;

 public:
-  explicit MapResolver(Record *CurRec = nullptr) : Resolver(CurRec) {}
+  explicit MapResolver(const Record *CurRec = nullptr) : Resolver(CurRec) {}

-  void set(Init *Key, Init *Value) { Map[Key] = {Value, false}; }
+  void set(const Init *Key, const Init *Value) { Map[Key] = {Value, false}; }

   bool isComplete(Init *VarName) const {
     auto It = Map.find(VarName);
@@ -2239,21 +2253,21 @@ public:
     return It->second.V->isComplete();
   }

-  Init *resolve(Init *VarName) override;
+  const Init *resolve(const Init *VarName) override;
 };

 /// Resolve all variables from a record except for unset variables.
 class RecordResolver final : public Resolver {
-  DenseMap<Init *, Init *> Cache;
-  SmallVector<Init *, 4> Stack;
-  Init *Name = nullptr;
+  DenseMap<const Init *, const Init *> Cache;
+  SmallVector<const Init *, 4> Stack;
+  const Init *Name = nullptr;

 public:
-  explicit RecordResolver(Record &R) : Resolver(&R) {}
+  explicit RecordResolver(const Record &R) : Resolver(&R) {}

-  void setName(Init *NewName) { Name = NewName; }
+  void setName(const Init *NewName) { Name = NewName; }

-  Init *resolve(Init *VarName) override;
+  const Init *resolve(const Init *VarName) override;

   bool keepUnsetBits() const override { return true; }
 };
@@ -2261,7 +2275,7 @@ public:

 /// Delegate resolving to a sub-resolver, but shadow some variable names.
 class ShadowResolver final : public Resolver {
   Resolver &R;
-  DenseSet<Init *> Shadowed;
+  DenseSet<const Init *> Shadowed;

 public:
   explicit ShadowResolver(Resolver &R)
       : Resolver(R.getCurrentRecord()), R(R) {
     setFinal(R.isFinal());
   }

-  void addShadow(Init *Key) { Shadowed.insert(Key); }
+  void addShadow(const Init *Key) { Shadowed.insert(Key); }

-  Init *resolve(Init *VarName) override {
+  const Init *resolve(const Init *VarName) override {
     if (Shadowed.count(VarName))
       return nullptr;
     return R.resolve(VarName);
@@ -2290,22 +2304,22 @@ public:

   bool foundUnresolved() const { return FoundUnresolved; }

-  Init *resolve(Init *VarName) override;
+  const Init *resolve(const Init *VarName) override;
 };

 /// Do not resolve anything, but keep track of whether a given variable was
 /// referenced.
 class HasReferenceResolver final : public Resolver {
-  Init *VarNameToTrack;
+  const Init *VarNameToTrack;
   bool Found = false;

 public:
-  explicit HasReferenceResolver(Init *VarNameToTrack)
+  explicit HasReferenceResolver(const Init *VarNameToTrack)
       : Resolver(nullptr), VarNameToTrack(VarNameToTrack) {}

   bool found() const { return Found; }

-  Init *resolve(Init *VarName) override;
+  const Init *resolve(const Init *VarName) override;
 };

 void EmitDetailedRecords(const RecordKeeper &RK, raw_ostream &OS);

diff --git a/llvm/lib/TableGen/DetailedRecordsBackend.cpp b/llvm/lib/TableGen/DetailedRecordsBackend.cpp
index 61fd3634902c..4a337248c941 100644
--- a/llvm/lib/TableGen/DetailedRecordsBackend.cpp
+++ b/llvm/lib/TableGen/DetailedRecordsBackend.cpp
@@ -131,7 +131,7 @@ void DetailedRecordsEmitter::printDefms(const Record &Rec, raw_ostream &OS) {

 // Print the template arguments of a class.
void DetailedRecordsEmitter::printTemplateArgs(const Record &Rec, raw_ostream &OS) { - ArrayRef Args = Rec.getTemplateArgs(); + ArrayRef Args = Rec.getTemplateArgs(); if (Args.empty()) { OS << " Template args: (none)\n"; return; diff --git a/llvm/lib/TableGen/Error.cpp b/llvm/lib/TableGen/Error.cpp index 6d1d5814223a..91423664a84c 100644 --- a/llvm/lib/TableGen/Error.cpp +++ b/llvm/lib/TableGen/Error.cpp @@ -160,7 +160,7 @@ void PrintFatalError(const RecordVal *RecVal, const Twine &Msg) { // Check an assertion: Obtain the condition value and be sure it is true. // If not, print a nonfatal error along with the message. -bool CheckAssert(SMLoc Loc, Init *Condition, Init *Message) { +bool CheckAssert(SMLoc Loc, const Init *Condition, const Init *Message) { auto *CondValue = dyn_cast_or_null(Condition->convertInitializerTo( IntRecTy::get(Condition->getRecordKeeper()))); if (!CondValue) { @@ -178,7 +178,7 @@ bool CheckAssert(SMLoc Loc, Init *Condition, Init *Message) { } // Dump a message to stderr. -void dumpMessage(SMLoc Loc, Init *Message) { +void dumpMessage(SMLoc Loc, const Init *Message) { if (auto *MessageInit = dyn_cast(Message)) PrintNote(Loc, MessageInit->getValue()); else diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 447ecb7d74d2..f8ea88375c48 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -74,8 +74,8 @@ struct RecordKeeperImpl { FoldingSet TheArgumentInitPool; FoldingSet TheBitsInitPool; std::map TheIntInitPool; - StringMap StringInitStringPool; - StringMap StringInitCodePool; + StringMap StringInitStringPool; + StringMap StringInitCodePool; FoldingSet TheListInitPool; FoldingSet TheUnOpInitPool; FoldingSet TheBinOpInitPool; @@ -83,10 +83,12 @@ struct RecordKeeperImpl { FoldingSet TheFoldOpInitPool; FoldingSet TheIsAOpInitPool; FoldingSet TheExistsOpInitPool; - DenseMap, VarInit *> TheVarInitPool; - DenseMap, VarBitInit *> TheVarBitInitPool; + DenseMap, VarInit *> TheVarInitPool; + DenseMap, VarBitInit *> + TheVarBitInitPool; FoldingSet TheVarDefInitPool; - DenseMap, FieldInit *> TheFieldInitPool; + DenseMap, FieldInit *> + TheFieldInitPool; FoldingSet TheCondOpInitPool; FoldingSet TheDagInitPool; FoldingSet RecordTypePool; @@ -389,15 +391,13 @@ UnsetInit *UnsetInit::get(RecordKeeper &RK) { return &RK.getImpl().TheUnsetInit; } -Init *UnsetInit::getCastTo(const RecTy *Ty) const { - return const_cast(this); -} +const Init *UnsetInit::getCastTo(const RecTy *Ty) const { return this; } -Init *UnsetInit::convertInitializerTo(const RecTy *Ty) const { - return const_cast(this); +const Init *UnsetInit::convertInitializerTo(const RecTy *Ty) const { + return this; } -static void ProfileArgumentInit(FoldingSetNodeID &ID, Init *Value, +static void ProfileArgumentInit(FoldingSetNodeID &ID, const Init *Value, ArgAuxType Aux) { auto I = Aux.index(); ID.AddInteger(I); @@ -412,14 +412,15 @@ void ArgumentInit::Profile(FoldingSetNodeID &ID) const { ProfileArgumentInit(ID, Value, Aux); } -ArgumentInit *ArgumentInit::get(Init *Value, ArgAuxType Aux) { +const ArgumentInit *ArgumentInit::get(const Init *Value, ArgAuxType Aux) { FoldingSetNodeID ID; ProfileArgumentInit(ID, Value, Aux); RecordKeeper &RK = Value->getRecordKeeper(); detail::RecordKeeperImpl &RKImpl = RK.getImpl(); void *IP = nullptr; - if (ArgumentInit *I = RKImpl.TheArgumentInitPool.FindNodeOrInsertPos(ID, IP)) + if (const ArgumentInit *I = + RKImpl.TheArgumentInitPool.FindNodeOrInsertPos(ID, IP)) return I; ArgumentInit *I = new (RKImpl.Allocator) ArgumentInit(Value, Aux); @@ 
-427,8 +428,8 @@ ArgumentInit *ArgumentInit::get(Init *Value, ArgAuxType Aux) { return I; } -Init *ArgumentInit::resolveReferences(Resolver &R) const { - Init *NewValue = Value->resolveReferences(R); +const Init *ArgumentInit::resolveReferences(Resolver &R) const { + const Init *NewValue = Value->resolveReferences(R); if (NewValue != Value) return cloneWithValue(NewValue); @@ -439,7 +440,7 @@ BitInit *BitInit::get(RecordKeeper &RK, bool V) { return V ? &RK.getImpl().TrueBitInit : &RK.getImpl().FalseBitInit; } -Init *BitInit::convertInitializerTo(const RecTy *Ty) const { +const Init *BitInit::convertInitializerTo(const RecTy *Ty) const { if (isa(Ty)) return const_cast(this); @@ -455,15 +456,15 @@ Init *BitInit::convertInitializerTo(const RecTy *Ty) const { return nullptr; } -static void -ProfileBitsInit(FoldingSetNodeID &ID, ArrayRef Range) { +static void ProfileBitsInit(FoldingSetNodeID &ID, + ArrayRef Range) { ID.AddInteger(Range.size()); - for (Init *I : Range) + for (const Init *I : Range) ID.AddPointer(I); } -BitsInit *BitsInit::get(RecordKeeper &RK, ArrayRef Range) { +BitsInit *BitsInit::get(RecordKeeper &RK, ArrayRef Range) { FoldingSetNodeID ID; ProfileBitsInit(ID, Range); @@ -472,20 +473,20 @@ BitsInit *BitsInit::get(RecordKeeper &RK, ArrayRef Range) { if (BitsInit *I = RKImpl.TheBitsInitPool.FindNodeOrInsertPos(ID, IP)) return I; - void *Mem = RKImpl.Allocator.Allocate(totalSizeToAlloc(Range.size()), - alignof(BitsInit)); + void *Mem = RKImpl.Allocator.Allocate( + totalSizeToAlloc(Range.size()), alignof(BitsInit)); BitsInit *I = new (Mem) BitsInit(RK, Range.size()); std::uninitialized_copy(Range.begin(), Range.end(), - I->getTrailingObjects()); + I->getTrailingObjects()); RKImpl.TheBitsInitPool.InsertNode(I, IP); return I; } void BitsInit::Profile(FoldingSetNodeID &ID) const { - ProfileBitsInit(ID, ArrayRef(getTrailingObjects(), NumBits)); + ProfileBitsInit(ID, ArrayRef(getTrailingObjects(), NumBits)); } -Init *BitsInit::convertInitializerTo(const RecTy *Ty) const { +const Init *BitsInit::convertInitializerTo(const RecTy *Ty) const { if (isa(Ty)) { if (getNumBits() != 1) return nullptr; // Only accept if just one bit! return getBit(0); @@ -517,9 +518,9 @@ std::optional BitsInit::convertInitializerToInt() const { return Result; } -Init * +const Init * BitsInit::convertInitializerBitRange(ArrayRef Bits) const { - SmallVector NewBits(Bits.size()); + SmallVector NewBits(Bits.size()); for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Bits[i] >= getNumBits()) @@ -541,7 +542,7 @@ std::string BitsInit::getAsString() const { std::string Result = "{ "; for (unsigned i = 0, e = getNumBits(); i != e; ++i) { if (i) Result += ", "; - if (Init *Bit = getBit(e-i-1)) + if (const Init *Bit = getBit(e - i - 1)) Result += Bit->getAsString(); else Result += "*"; @@ -551,18 +552,18 @@ std::string BitsInit::getAsString() const { // resolveReferences - If there are any field references that refer to fields // that have been filled in, we can propagate the values now. 
-Init *BitsInit::resolveReferences(Resolver &R) const { +const Init *BitsInit::resolveReferences(Resolver &R) const { bool Changed = false; - SmallVector NewBits(getNumBits()); + SmallVector NewBits(getNumBits()); - Init *CachedBitVarRef = nullptr; - Init *CachedBitVarResolved = nullptr; + const Init *CachedBitVarRef = nullptr; + const Init *CachedBitVarResolved = nullptr; for (unsigned i = 0, e = getNumBits(); i != e; ++i) { - Init *CurBit = getBit(i); - Init *NewBit = CurBit; + const Init *CurBit = getBit(i); + const Init *NewBit = CurBit; - if (VarBitInit *CurBitVar = dyn_cast(CurBit)) { + if (const VarBitInit *CurBitVar = dyn_cast(CurBit)) { if (CurBitVar->getBitVar() != CachedBitVarRef) { CachedBitVarRef = CurBitVar->getBitVar(); CachedBitVarResolved = CachedBitVarRef->resolveReferences(R); @@ -583,7 +584,7 @@ Init *BitsInit::resolveReferences(Resolver &R) const { if (Changed) return BitsInit::get(getRecordKeeper(), NewBits); - return const_cast(this); + return this; } IntInit *IntInit::get(RecordKeeper &RK, int64_t V) { @@ -603,7 +604,7 @@ static bool canFitInBitfield(int64_t Value, unsigned NumBits) { (Value >> NumBits == 0) || (Value >> (NumBits-1) == -1); } -Init *IntInit::convertInitializerTo(const RecTy *Ty) const { +const Init *IntInit::convertInitializerTo(const RecTy *Ty) const { if (isa(Ty)) return const_cast(this); @@ -619,7 +620,7 @@ Init *IntInit::convertInitializerTo(const RecTy *Ty) const { if (!canFitInBitfield(Value, BRT->getNumBits())) return nullptr; - SmallVector NewBits(BRT->getNumBits()); + SmallVector NewBits(BRT->getNumBits()); for (unsigned i = 0; i != BRT->getNumBits(); ++i) NewBits[i] = BitInit::get(getRecordKeeper(), Value & ((i < 64) ? (1LL << i) : 0)); @@ -630,9 +631,8 @@ Init *IntInit::convertInitializerTo(const RecTy *Ty) const { return nullptr; } -Init * -IntInit::convertInitializerBitRange(ArrayRef Bits) const { - SmallVector NewBits(Bits.size()); +const Init *IntInit::convertInitializerBitRange(ArrayRef Bits) const { + SmallVector NewBits(Bits.size()); for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Bits[i] >= 64) @@ -648,7 +648,7 @@ AnonymousNameInit *AnonymousNameInit::get(RecordKeeper &RK, unsigned V) { return new (RK.getImpl().Allocator) AnonymousNameInit(RK, V); } -StringInit *AnonymousNameInit::getNameInit() const { +const StringInit *AnonymousNameInit::getNameInit() const { return StringInit::get(getRecordKeeper(), getAsString()); } @@ -656,7 +656,7 @@ std::string AnonymousNameInit::getAsString() const { return "anonymous_" + utostr(Value); } -Init *AnonymousNameInit::resolveReferences(Resolver &R) const { +const Init *AnonymousNameInit::resolveReferences(Resolver &R) const { auto *Old = const_cast(static_cast(this)); auto *New = R.resolve(Old); New = New ? New : Old; @@ -666,7 +666,8 @@ Init *AnonymousNameInit::resolveReferences(Resolver &R) const { return New; } -StringInit *StringInit::get(RecordKeeper &RK, StringRef V, StringFormat Fmt) { +const StringInit *StringInit::get(RecordKeeper &RK, StringRef V, + StringFormat Fmt) { detail::RecordKeeperImpl &RKImpl = RK.getImpl(); auto &InitMap = Fmt == SF_String ? 
RKImpl.StringInitStringPool : RKImpl.StringInitCodePool; @@ -676,39 +677,40 @@ StringInit *StringInit::get(RecordKeeper &RK, StringRef V, StringFormat Fmt) { return Entry.second; } -Init *StringInit::convertInitializerTo(const RecTy *Ty) const { +const Init *StringInit::convertInitializerTo(const RecTy *Ty) const { if (isa(Ty)) return const_cast(this); return nullptr; } -static void ProfileListInit(FoldingSetNodeID &ID, ArrayRef Range, +static void ProfileListInit(FoldingSetNodeID &ID, ArrayRef Range, const RecTy *EltTy) { ID.AddInteger(Range.size()); ID.AddPointer(EltTy); - for (Init *I : Range) + for (const Init *I : Range) ID.AddPointer(I); } -ListInit *ListInit::get(ArrayRef Range, const RecTy *EltTy) { +const ListInit *ListInit::get(ArrayRef Range, + const RecTy *EltTy) { FoldingSetNodeID ID; ProfileListInit(ID, Range, EltTy); detail::RecordKeeperImpl &RK = EltTy->getRecordKeeper().getImpl(); void *IP = nullptr; - if (ListInit *I = RK.TheListInitPool.FindNodeOrInsertPos(ID, IP)) + if (const ListInit *I = RK.TheListInitPool.FindNodeOrInsertPos(ID, IP)) return I; assert(Range.empty() || !isa(Range[0]) || cast(Range[0])->getType()->typeIsConvertibleTo(EltTy)); - void *Mem = RK.Allocator.Allocate(totalSizeToAlloc(Range.size()), - alignof(ListInit)); + void *Mem = RK.Allocator.Allocate( + totalSizeToAlloc(Range.size()), alignof(ListInit)); ListInit *I = new (Mem) ListInit(Range.size(), EltTy); std::uninitialized_copy(Range.begin(), Range.end(), - I->getTrailingObjects()); + I->getTrailingObjects()); RK.TheListInitPool.InsertNode(I, IP); return I; } @@ -719,20 +721,20 @@ void ListInit::Profile(FoldingSetNodeID &ID) const { ProfileListInit(ID, getValues(), EltTy); } -Init *ListInit::convertInitializerTo(const RecTy *Ty) const { +const Init *ListInit::convertInitializerTo(const RecTy *Ty) const { if (getType() == Ty) return const_cast(this); if (auto *LRT = dyn_cast(Ty)) { - SmallVector Elements; + SmallVector Elements; Elements.reserve(getValues().size()); // Verify that all of the elements of the list are subclasses of the // appropriate class! 
bool Changed = false; const RecTy *ElementType = LRT->getElementType(); - for (Init *I : getValues()) - if (Init *CI = I->convertInitializerTo(ElementType)) { + for (const Init *I : getValues()) + if (const Init *CI = I->convertInitializerTo(ElementType)) { Elements.push_back(CI); if (CI != I) Changed = true; @@ -749,30 +751,30 @@ Init *ListInit::convertInitializerTo(const RecTy *Ty) const { const Record *ListInit::getElementAsRecord(unsigned i) const { assert(i < NumValues && "List element index out of range!"); - DefInit *DI = dyn_cast(getElement(i)); + const DefInit *DI = dyn_cast(getElement(i)); if (!DI) PrintFatalError("Expected record in list!"); return DI->getDef(); } -Init *ListInit::resolveReferences(Resolver &R) const { - SmallVector Resolved; +const Init *ListInit::resolveReferences(Resolver &R) const { + SmallVector Resolved; Resolved.reserve(size()); bool Changed = false; - for (Init *CurElt : getValues()) { - Init *E = CurElt->resolveReferences(R); + for (const Init *CurElt : getValues()) { + const Init *E = CurElt->resolveReferences(R); Changed |= E != CurElt; Resolved.push_back(E); } if (Changed) return ListInit::get(Resolved, getElementType()); - return const_cast(this); + return this; } bool ListInit::isComplete() const { - for (Init *Element : *this) { + for (const Init *Element : *this) { if (!Element->isComplete()) return false; } @@ -780,7 +782,7 @@ bool ListInit::isComplete() const { } bool ListInit::isConcrete() const { - for (Init *Element : *this) { + for (const Init *Element : *this) { if (!Element->isConcrete()) return false; } @@ -790,7 +792,7 @@ bool ListInit::isConcrete() const { std::string ListInit::getAsString() const { std::string Result = "["; const char *sep = ""; - for (Init *Element : *this) { + for (const Init *Element : *this) { Result += sep; sep = ", "; Result += Element->getAsString(); @@ -798,26 +800,26 @@ std::string ListInit::getAsString() const { return Result + "]"; } -Init *OpInit::getBit(unsigned Bit) const { +const Init *OpInit::getBit(unsigned Bit) const { if (getType() == BitRecTy::get(getRecordKeeper())) return const_cast(this); - return VarBitInit::get(const_cast(this), Bit); + return VarBitInit::get(this, Bit); } -static void ProfileUnOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *Op, - const RecTy *Type) { +static void ProfileUnOpInit(FoldingSetNodeID &ID, unsigned Opcode, + const Init *Op, const RecTy *Type) { ID.AddInteger(Opcode); ID.AddPointer(Op); ID.AddPointer(Type); } -UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, const RecTy *Type) { +const UnOpInit *UnOpInit::get(UnaryOp Opc, const Init *LHS, const RecTy *Type) { FoldingSetNodeID ID; ProfileUnOpInit(ID, Opc, LHS, Type); detail::RecordKeeperImpl &RK = Type->getRecordKeeper().getImpl(); void *IP = nullptr; - if (UnOpInit *I = RK.TheUnOpInitPool.FindNodeOrInsertPos(ID, IP)) + if (const UnOpInit *I = RK.TheUnOpInitPool.FindNodeOrInsertPos(ID, IP)) return I; UnOpInit *I = new (RK.Allocator) UnOpInit(Opc, LHS, Type); @@ -829,7 +831,7 @@ void UnOpInit::Profile(FoldingSetNodeID &ID) const { ProfileUnOpInit(ID, getOpcode(), getOperand(), getType()); } -Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { +const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { RecordKeeper &RK = getRecordKeeper(); switch (getOpcode()) { case REPR: @@ -851,27 +853,27 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { } break; case TOLOWER: - if (StringInit *LHSs = dyn_cast(LHS)) + if (const StringInit *LHSs = dyn_cast(LHS)) return StringInit::get(RK, 
LHSs->getValue().lower()); break; case TOUPPER: - if (StringInit *LHSs = dyn_cast(LHS)) + if (const StringInit *LHSs = dyn_cast(LHS)) return StringInit::get(RK, LHSs->getValue().upper()); break; case CAST: if (isa(getType())) { - if (StringInit *LHSs = dyn_cast(LHS)) + if (const StringInit *LHSs = dyn_cast(LHS)) return LHSs; - if (DefInit *LHSd = dyn_cast(LHS)) + if (const DefInit *LHSd = dyn_cast(LHS)) return StringInit::get(RK, LHSd->getAsString()); - if (IntInit *LHSi = dyn_cast_or_null( + if (const IntInit *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) return StringInit::get(RK, LHSi->getAsString()); } else if (isa(getType())) { - if (StringInit *Name = dyn_cast(LHS)) { + if (const StringInit *Name = dyn_cast(LHS)) { const Record *D = RK.getDef(Name->getValue()); if (!D && CurRec) { // Self-references are allowed, but their resolution is delayed until @@ -911,25 +913,25 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { } } - if (Init *NewInit = LHS->convertInitializerTo(getType())) + if (const Init *NewInit = LHS->convertInitializerTo(getType())) return NewInit; break; case NOT: - if (IntInit *LHSi = dyn_cast_or_null( + if (const IntInit *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) return IntInit::get(RK, LHSi->getValue() ? 0 : 1); break; case HEAD: - if (ListInit *LHSl = dyn_cast(LHS)) { + if (const ListInit *LHSl = dyn_cast(LHS)) { assert(!LHSl->empty() && "Empty list in head"); return LHSl->getElement(0); } break; case TAIL: - if (ListInit *LHSl = dyn_cast(LHS)) { + if (const ListInit *LHSl = dyn_cast(LHS)) { assert(!LHSl->empty() && "Empty list in tail"); // Note the +1. We can't just pass the result of getValues() // directly. @@ -938,25 +940,25 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { break; case SIZE: - if (ListInit *LHSl = dyn_cast(LHS)) + if (const ListInit *LHSl = dyn_cast(LHS)) return IntInit::get(RK, LHSl->size()); - if (DagInit *LHSd = dyn_cast(LHS)) + if (const DagInit *LHSd = dyn_cast(LHS)) return IntInit::get(RK, LHSd->arg_size()); - if (StringInit *LHSs = dyn_cast(LHS)) + if (const StringInit *LHSs = dyn_cast(LHS)) return IntInit::get(RK, LHSs->getValue().size()); break; case EMPTY: - if (ListInit *LHSl = dyn_cast(LHS)) + if (const ListInit *LHSl = dyn_cast(LHS)) return IntInit::get(RK, LHSl->empty()); - if (DagInit *LHSd = dyn_cast(LHS)) + if (const DagInit *LHSd = dyn_cast(LHS)) return IntInit::get(RK, LHSd->arg_empty()); - if (StringInit *LHSs = dyn_cast(LHS)) + if (const StringInit *LHSs = dyn_cast(LHS)) return IntInit::get(RK, LHSs->getValue().empty()); break; case GETDAGOP: - if (DagInit *Dag = dyn_cast(LHS)) { + if (const DagInit *Dag = dyn_cast(LHS)) { // TI is not necessarily a def due to the late resolution in multiclasses, // but has to be a TypedInit. auto *TI = cast(Dag->getOperator()); @@ -972,7 +974,7 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { break; case LOG2: - if (IntInit *LHSi = dyn_cast_or_null( + if (const IntInit *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) { int64_t LHSv = LHSi->getValue(); if (LHSv <= 0) { @@ -989,21 +991,22 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { break; case LISTFLATTEN: - if (ListInit *LHSList = dyn_cast(LHS)) { + if (const ListInit *LHSList = dyn_cast(LHS)) { const ListRecTy *InnerListTy = dyn_cast(LHSList->getElementType()); // list of non-lists, !listflatten() is a NOP. 
if (!InnerListTy) return LHS; - auto Flatten = [](ListInit *List) -> std::optional> { - std::vector Flattened; + auto Flatten = + [](const ListInit *List) -> std::optional> { + std::vector Flattened; // Concatenate elements of all the inner lists. - for (Init *InnerInit : List->getValues()) { - ListInit *InnerList = dyn_cast(InnerInit); + for (const Init *InnerInit : List->getValues()) { + const ListInit *InnerList = dyn_cast(InnerInit); if (!InnerList) return std::nullopt; - for (Init *InnerElem : InnerList->getValues()) + for (const Init *InnerElem : InnerList->getValues()) Flattened.push_back(InnerElem); }; return Flattened; @@ -1018,13 +1021,13 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { return const_cast(this); } -Init *UnOpInit::resolveReferences(Resolver &R) const { - Init *lhs = LHS->resolveReferences(R); +const Init *UnOpInit::resolveReferences(Resolver &R) const { + const Init *lhs = LHS->resolveReferences(R); if (LHS != lhs || (R.isFinal() && getOpcode() == CAST)) return (UnOpInit::get(getOpcode(), lhs, getType())) ->Fold(R.getCurrentRecord(), R.isFinal()); - return const_cast(this); + return this; } std::string UnOpInit::getAsString() const { @@ -1054,22 +1057,23 @@ std::string UnOpInit::getAsString() const { return Result + "(" + LHS->getAsString() + ")"; } -static void ProfileBinOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, - Init *RHS, const RecTy *Type) { +static void ProfileBinOpInit(FoldingSetNodeID &ID, unsigned Opcode, + const Init *LHS, const Init *RHS, + const RecTy *Type) { ID.AddInteger(Opcode); ID.AddPointer(LHS); ID.AddPointer(RHS); ID.AddPointer(Type); } -BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS, Init *RHS, - const RecTy *Type) { +const BinOpInit *BinOpInit::get(BinaryOp Opc, const Init *LHS, const Init *RHS, + const RecTy *Type) { FoldingSetNodeID ID; ProfileBinOpInit(ID, Opc, LHS, RHS, Type); detail::RecordKeeperImpl &RK = LHS->getRecordKeeper().getImpl(); void *IP = nullptr; - if (BinOpInit *I = RK.TheBinOpInitPool.FindNodeOrInsertPos(ID, IP)) + if (const BinOpInit *I = RK.TheBinOpInitPool.FindNodeOrInsertPos(ID, IP)) return I; BinOpInit *I = new (RK.Allocator) BinOpInit(Opc, LHS, RHS, Type); @@ -1081,8 +1085,8 @@ void BinOpInit::Profile(FoldingSetNodeID &ID) const { ProfileBinOpInit(ID, getOpcode(), getLHS(), getRHS(), getType()); } -static StringInit *ConcatStringInits(const StringInit *I0, - const StringInit *I1) { +static const StringInit *ConcatStringInits(const StringInit *I0, + const StringInit *I1) { SmallString<80> Concat(I0->getValue()); Concat.append(I1->getValue()); return StringInit::get( @@ -1090,11 +1094,11 @@ static StringInit *ConcatStringInits(const StringInit *I0, StringInit::determineFormat(I0->getFormat(), I1->getFormat())); } -static StringInit *interleaveStringList(const ListInit *List, - const StringInit *Delim) { +static const StringInit *interleaveStringList(const ListInit *List, + const StringInit *Delim) { if (List->size() == 0) return StringInit::get(List->getRecordKeeper(), ""); - StringInit *Element = dyn_cast(List->getElement(0)); + const StringInit *Element = dyn_cast(List->getElement(0)); if (!Element) return nullptr; SmallString<80> Result(Element->getValue()); @@ -1102,7 +1106,7 @@ static StringInit *interleaveStringList(const ListInit *List, for (unsigned I = 1, E = List->size(); I < E; ++I) { Result.append(Delim->getValue()); - StringInit *Element = dyn_cast(List->getElement(I)); + const StringInit *Element = dyn_cast(List->getElement(I)); if (!Element) return nullptr; 
     Result.append(Element->getValue());
@@ -1111,12 +1115,12 @@ static StringInit *interleaveStringList(const ListInit *List,
   return StringInit::get(List->getRecordKeeper(), Result, Fmt);
 }

-static StringInit *interleaveIntList(const ListInit *List,
-                                     const StringInit *Delim) {
+static const StringInit *interleaveIntList(const ListInit *List,
+                                           const StringInit *Delim) {
   RecordKeeper &RK = List->getRecordKeeper();
   if (List->size() == 0)
     return StringInit::get(RK, "");
-  IntInit *Element = dyn_cast_or_null<IntInit>(
+  const IntInit *Element = dyn_cast_or_null<IntInit>(
       List->getElement(0)->convertInitializerTo(IntRecTy::get(RK)));
   if (!Element)
     return nullptr;
@@ -1124,7 +1128,7 @@ static StringInit *interleaveIntList(const ListInit *List,

   for (unsigned I = 1, E = List->size(); I < E; ++I) {
     Result.append(Delim->getValue());
-    IntInit *Element = dyn_cast_or_null<IntInit>(
+    const IntInit *Element = dyn_cast_or_null<IntInit>(
         List->getElement(I)->convertInitializerTo(IntRecTy::get(RK)));
     if (!Element)
       return nullptr;
@@ -1133,7 +1137,7 @@ static StringInit *interleaveIntList(const ListInit *List,
   return StringInit::get(RK, Result);
 }

-Init *BinOpInit::getStrConcat(Init *I0, Init *I1) {
+const Init *BinOpInit::getStrConcat(const Init *I0, const Init *I1) {
   // Shortcut for the common case of concatenating two strings.
   if (const StringInit *I0s = dyn_cast<StringInit>(I0))
     if (const StringInit *I1s = dyn_cast<StringInit>(I1))
@@ -1142,15 +1146,15 @@ Init *BinOpInit::getStrConcat(Init *I0, Init *I1) {
                        StringRecTy::get(I0->getRecordKeeper()));
 }

-static ListInit *ConcatListInits(const ListInit *LHS,
-                                 const ListInit *RHS) {
-  SmallVector<Init *, 8> Args;
+static const ListInit *ConcatListInits(const ListInit *LHS,
+                                       const ListInit *RHS) {
+  SmallVector<const Init *, 8> Args;
   llvm::append_range(Args, *LHS);
   llvm::append_range(Args, *RHS);
   return ListInit::get(Args, LHS->getElementType());
 }

-Init *BinOpInit::getListConcat(TypedInit *LHS, Init *RHS) {
+const Init *BinOpInit::getListConcat(const TypedInit *LHS, const Init *RHS) {
   assert(isa<ListRecTy>(LHS->getType()) && "First arg must be a list");

   // Shortcut for the common case of concatenating two lists.
@@ -1160,12 +1164,12 @@ Init *BinOpInit::getListConcat(TypedInit *LHS, Init *RHS) {
   return BinOpInit::get(BinOpInit::LISTCONCAT, LHS, RHS, LHS->getType());
 }

-std::optional<bool> BinOpInit::CompareInit(unsigned Opc, Init *LHS,
-                                           Init *RHS) const {
+std::optional<bool> BinOpInit::CompareInit(unsigned Opc, const Init *LHS,
+                                           const Init *RHS) const {
   // First see if we have two bit, bits, or int.
-  IntInit *LHSi = dyn_cast_or_null<IntInit>(
+  const IntInit *LHSi = dyn_cast_or_null<IntInit>(
       LHS->convertInitializerTo(IntRecTy::get(getRecordKeeper())));
-  IntInit *RHSi = dyn_cast_or_null<IntInit>(
+  const IntInit *RHSi = dyn_cast_or_null<IntInit>(
       RHS->convertInitializerTo(IntRecTy::get(getRecordKeeper())));

   if (LHSi && RHSi) {
@@ -1196,8 +1200,8 @@ std::optional<bool> BinOpInit::CompareInit(unsigned Opc, Init *LHS,
   }

   // Next try strings.
-  StringInit *LHSs = dyn_cast<StringInit>(LHS);
-  StringInit *RHSs = dyn_cast<StringInit>(RHS);
+  const StringInit *LHSs = dyn_cast<StringInit>(LHS);
+  const StringInit *RHSs = dyn_cast<StringInit>(RHS);

   if (LHSs && RHSs) {
     bool Result;
@@ -1228,8 +1232,8 @@ std::optional<bool> BinOpInit::CompareInit(unsigned Opc, Init *LHS,

   // Finally, !eq and !ne can be used with records.
   if (Opc == EQ || Opc == NE) {
-    DefInit *LHSd = dyn_cast<DefInit>(LHS);
-    DefInit *RHSd = dyn_cast<DefInit>(RHS);
+    const DefInit *LHSd = dyn_cast<DefInit>(LHS);
+    const DefInit *RHSd = dyn_cast<DefInit>(RHS);
     if (LHSd && RHSd)
       return (Opc == EQ) ? LHSd == RHSd : LHSd != RHSd;
   }
@@ -1237,10 +1241,10 @@ std::optional<bool> BinOpInit::CompareInit(unsigned Opc, Init *LHS,
   return std::nullopt;
 }

-static std::optional<unsigned> getDagArgNoByKey(DagInit *Dag, Init *Key,
-                                                std::string &Error) {
+static std::optional<unsigned>
+getDagArgNoByKey(const DagInit *Dag, const Init *Key, std::string &Error) {
   // Accessor by index
-  if (IntInit *Idx = dyn_cast<IntInit>(Key)) {
+  if (const IntInit *Idx = dyn_cast<IntInit>(Key)) {
     int64_t Pos = Idx->getValue();
     if (Pos < 0) {
       // The index is negative.
@@ -1260,7 +1264,7 @@ static std::optional<unsigned> getDagArgNoByKey(DagInit *Dag, Init *Key,
   }
   assert(isa<StringInit>(Key));
   // Accessor by name
-  StringInit *Name = dyn_cast<StringInit>(Key);
+  const StringInit *Name = dyn_cast<StringInit>(Key);
   auto ArgNo = Dag->getArgNo(Name->getValue());
   if (!ArgNo) {
     // The key is not found.
@@ -1270,14 +1274,14 @@ static std::optional<unsigned> getDagArgNoByKey(DagInit *Dag, Init *Key,
   return *ArgNo;
 }

-Init *BinOpInit::Fold(Record *CurRec) const {
+const Init *BinOpInit::Fold(const Record *CurRec) const {
   switch (getOpcode()) {
   case CONCAT: {
-    DagInit *LHSs = dyn_cast<DagInit>(LHS);
-    DagInit *RHSs = dyn_cast<DagInit>(RHS);
+    const DagInit *LHSs = dyn_cast<DagInit>(LHS);
+    const DagInit *RHSs = dyn_cast<DagInit>(RHS);
     if (LHSs && RHSs) {
-      DefInit *LOp = dyn_cast<DefInit>(LHSs->getOperator());
-      DefInit *ROp = dyn_cast<DefInit>(RHSs->getOperator());
+      const DefInit *LOp = dyn_cast<DefInit>(LHSs->getOperator());
+      const DefInit *ROp = dyn_cast<DefInit>(RHSs->getOperator());
       if ((!LOp && !isa<UnsetInit>(LHSs->getOperator())) ||
           (!ROp && !isa<UnsetInit>(RHSs->getOperator())))
         break;
@@ -1286,12 +1290,12 @@ Init *BinOpInit::Fold(Record *CurRec) const {
                         LHSs->getAsString() + "' vs. '" + RHSs->getAsString() +
                         "'");
       }
-      Init *Op = LOp ? LOp : ROp;
+      const Init *Op = LOp ? LOp : ROp;
       if (!Op)
         Op = UnsetInit::get(getRecordKeeper());

-      SmallVector<Init *, 8> Args;
-      SmallVector<StringInit *, 8> ArgNames;
+      SmallVector<const Init *, 8> Args;
+      SmallVector<const StringInit *, 8> ArgNames;
       for (unsigned i = 0, e = LHSs->getNumArgs(); i != e; ++i) {
         Args.push_back(LHSs->getArg(i));
         ArgNames.push_back(LHSs->getArgName(i));
@@ -1305,10 +1309,10 @@ Init *BinOpInit::Fold(Record *CurRec) const {
     break;
   }
   case LISTCONCAT: {
-    ListInit *LHSs = dyn_cast<ListInit>(LHS);
-    ListInit *RHSs = dyn_cast<ListInit>(RHS);
+    const ListInit *LHSs = dyn_cast<ListInit>(LHS);
+    const ListInit *RHSs = dyn_cast<ListInit>(RHS);
     if (LHSs && RHSs) {
-      SmallVector<Init *, 8> Args;
+      SmallVector<const Init *, 8> Args;
       llvm::append_range(Args, *LHSs);
       llvm::append_range(Args, *RHSs);
       return ListInit::get(Args, LHSs->getElementType());
@@ -1316,22 +1320,22 @@ Init *BinOpInit::Fold(Record *CurRec) const {
     break;
   }
   case LISTSPLAT: {
-    TypedInit *Value = dyn_cast<TypedInit>(LHS);
-    IntInit *Size = dyn_cast<IntInit>(RHS);
+    const TypedInit *Value = dyn_cast<TypedInit>(LHS);
+    const IntInit *Size = dyn_cast<IntInit>(RHS);
     if (Value && Size) {
-      SmallVector<Init *, 8> Args(Size->getValue(), Value);
+      SmallVector<const Init *, 8> Args(Size->getValue(), Value);
       return ListInit::get(Args, Value->getType());
     }
     break;
   }
   case LISTREMOVE: {
-    ListInit *LHSs = dyn_cast<ListInit>(LHS);
-    ListInit *RHSs = dyn_cast<ListInit>(RHS);
+    const ListInit *LHSs = dyn_cast<ListInit>(LHS);
+    const ListInit *RHSs = dyn_cast<ListInit>(RHS);
     if (LHSs && RHSs) {
-      SmallVector<Init *, 8> Args;
-      for (Init *EltLHS : *LHSs) {
+      SmallVector<const Init *, 8> Args;
+      for (const Init *EltLHS : *LHSs) {
         bool Found = false;
-        for (Init *EltRHS : *RHSs) {
+        for (const Init *EltRHS : *RHSs) {
           if (std::optional<bool> Result = CompareInit(EQ, EltLHS, EltRHS)) {
             if (*Result) {
               Found = true;
@@ -1361,7 +1365,7 @@ Init *BinOpInit::Fold(Record *CurRec) const {
     auto *SliceIdxs = dyn_cast<ListInit>(RHS);
     if (!TheList || !SliceIdxs)
       break;
-    SmallVector<Init *, 8> Args;
+    SmallVector<const Init *, 8> Args;
     Args.reserve(SliceIdxs->size());
     for (auto *I : *SliceIdxs) {
       auto *II = dyn_cast<IntInit>(I);
@@ -1382,7 +1386,7 @@ Init *BinOpInit::Fold(Record *CurRec) const {
     auto Start = LHSi->getValue();
     auto End = RHSi->getValue();
-    SmallVector<Init *, 8> Args;
+    SmallVector<const Init *, 8> Args;
     if (getOpcode() == RANGEC) {
       // Closed interval
       if (Start <= End) {
@@ -1407,17 +1411,17 @@ Init *BinOpInit::Fold(Record *CurRec) const {
     return ListInit::get(Args, LHSi->getType());
   }
   case STRCONCAT: {
-    StringInit *LHSs = dyn_cast<StringInit>(LHS);
-    StringInit *RHSs = dyn_cast<StringInit>(RHS);
+    const StringInit *LHSs = dyn_cast<StringInit>(LHS);
+    const StringInit *RHSs = dyn_cast<StringInit>(RHS);
     if (LHSs && RHSs)
       return ConcatStringInits(LHSs, RHSs);
     break;
   }
   case INTERLEAVE: {
-    ListInit *List = dyn_cast<ListInit>(LHS);
-    StringInit *Delim = dyn_cast<StringInit>(RHS);
+    const ListInit *List = dyn_cast<ListInit>(LHS);
+    const StringInit *Delim = dyn_cast<StringInit>(RHS);
     if (List && Delim) {
-      StringInit *Result;
+      const StringInit *Result;
       if (isa<StringRecTy>(List->getElementType()))
         Result = interleaveStringList(List, Delim);
       else
@@ -1438,7 +1442,7 @@ Init *BinOpInit::Fold(Record *CurRec) const {
     break;
   }
   case GETDAGARG: {
-    DagInit *Dag = dyn_cast<DagInit>(LHS);
+    const DagInit *Dag = dyn_cast<DagInit>(LHS);
     if (Dag && isa<StringInit, IntInit>(RHS)) {
       std::string Error;
       auto ArgNo = getDagArgNoByKey(Dag, RHS, Error);
@@ -1447,7 +1451,7 @@ Init *BinOpInit::Fold(Record *CurRec) const {

       assert(*ArgNo < Dag->getNumArgs());

-      Init *Arg = Dag->getArg(*ArgNo);
+      const Init *Arg = Dag->getArg(*ArgNo);
       if (auto *TI = dyn_cast<TypedInit>(Arg))
         if (!TI->getType()->typeIsConvertibleTo(getType()))
           return UnsetInit::get(Dag->getRecordKeeper());
@@ -1456,8 +1460,8 @@ Init *BinOpInit::Fold(Record *CurRec) const {
     break;
   }
   case GETDAGNAME: {
-    DagInit *Dag = dyn_cast<DagInit>(LHS);
-    IntInit *Idx = dyn_cast<IntInit>(RHS);
+    const DagInit *Dag = dyn_cast<DagInit>(LHS);
+    const IntInit *Idx = dyn_cast<IntInit>(RHS);
     if (Dag && Idx) {
       int64_t Pos = Idx->getValue();
       if (Pos < 0 || Pos >= Dag->getNumArgs()) {
@@ -1467,7 +1471,7 @@ Init *BinOpInit::Fold(Record *CurRec) const {
                            std::to_string(Dag->getNumArgs() - 1) + ": " +
                            std::to_string(Pos));
       }
-      Init *ArgName = Dag->getArgName(Pos);
+      const Init *ArgName = Dag->getArgName(Pos);
       if (!ArgName)
         return UnsetInit::get(getRecordKeeper());
       return ArgName;
@@ -1475,11 +1479,11 @@ Init *BinOpInit::Fold(Record *CurRec) const {
     break;
   }
   case SETDAGOP: {
-    DagInit *Dag = dyn_cast<DagInit>(LHS);
-    DefInit *Op = dyn_cast<DefInit>(RHS);
+    const DagInit *Dag = dyn_cast<DagInit>(LHS);
+    const DefInit *Op = dyn_cast<DefInit>(RHS);
     if (Dag && Op) {
-      SmallVector<Init *, 8> Args;
-      SmallVector<StringInit *, 8> ArgNames;
+      SmallVector<const Init *, 8> Args;
+      SmallVector<const StringInit *, 8> ArgNames;
       for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) {
         Args.push_back(Dag->getArg(i));
         ArgNames.push_back(Dag->getArgName(i));
@@ -1498,9 +1502,9 @@ Init *BinOpInit::Fold(Record *CurRec) const {
   case SHL:
   case SRA:
   case SRL: {
-    IntInit *LHSi = dyn_cast_or_null<IntInit>(
+    const IntInit *LHSi = dyn_cast_or_null<IntInit>(
         LHS->convertInitializerTo(IntRecTy::get(getRecordKeeper())));
-    IntInit *RHSi = dyn_cast_or_null<IntInit>(
+    const IntInit *RHSi = dyn_cast_or_null<IntInit>(
         RHS->convertInitializerTo(IntRecTy::get(getRecordKeeper())));
     if (LHSi && RHSi) {
       int64_t LHSv = LHSi->getValue(), RHSv = RHSi->getValue();
@@ -1533,17 +1537,17 @@ Init *BinOpInit::Fold(Record *CurRec) const {
    }
  }
  unresolved:
-  return const_cast<BinOpInit *>(this);
+  return this;
 }

-Init *BinOpInit::resolveReferences(Resolver &R) const {
-  Init *lhs = LHS->resolveReferences(R);
-  Init *rhs = RHS->resolveReferences(R);
+const Init *BinOpInit::resolveReferences(Resolver &R) const {
+  const Init *lhs = LHS->resolveReferences(R);
+  const Init *rhs = RHS->resolveReferences(R);

   if (LHS != lhs || RHS != rhs)
     return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))
         ->Fold(R.getCurrentRecord());
-  return const_cast<BinOpInit *>(this);
+  return this;
 }

 std::string BinOpInit::getAsString() const {
@@ -1589,8 +1593,9 @@ std::string BinOpInit::getAsString() const {
   return Result + "(" + LHS->getAsString() + ", " + RHS->getAsString() + ")";
 }

-static void ProfileTernOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS,
-                              Init *MHS, Init *RHS, const RecTy *Type) {
+static void ProfileTernOpInit(FoldingSetNodeID &ID, unsigned Opcode,
+                              const Init *LHS, const Init *MHS, const Init *RHS,
+                              const RecTy *Type) {
   ID.AddInteger(Opcode);
   ID.AddPointer(LHS);
   ID.AddPointer(MHS);
@@ -1598,8 +1603,9 @@ static void ProfileTernOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS,
   ID.AddPointer(Type);
 }

-TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS,
-                            const RecTy *Type) {
+const TernOpInit *TernOpInit::get(TernaryOp Opc, const Init *LHS,
+                                  const Init *MHS, const Init *RHS,
+                                  const RecTy *Type) {
   FoldingSetNodeID ID;
   ProfileTernOpInit(ID, Opc, LHS, MHS, RHS, Type);

@@ -1617,26 +1623,27 @@ void TernOpInit::Profile(FoldingSetNodeID &ID) const {
   ProfileTernOpInit(ID, getOpcode(), getLHS(), getMHS(), getRHS(), getType());
 }

-static Init *ItemApply(Init *LHS, Init *MHSe, Init *RHS, Record *CurRec) {
+static const Init *ItemApply(const Init *LHS, const Init *MHSe, const Init *RHS,
+                             const Record *CurRec) {
   MapResolver R(CurRec);
   R.set(LHS, MHSe);
   return RHS->resolveReferences(R);
 }

-static Init *ForeachDagApply(Init *LHS, DagInit *MHSd, Init *RHS,
-                             Record *CurRec) {
+static const Init *ForeachDagApply(const Init *LHS, const DagInit *MHSd,
                                   const Init *RHS, const Record *CurRec) {
   bool Change = false;
-  Init *Val = ItemApply(LHS, MHSd->getOperator(), RHS, CurRec);
+  const Init *Val = ItemApply(LHS, MHSd->getOperator(), RHS, CurRec);
   if (Val != MHSd->getOperator())
     Change = true;

-  SmallVector<std::pair<Init *, StringInit *>, 8> NewArgs;
+  SmallVector<std::pair<const Init *, const StringInit *>, 8> NewArgs;
   for (unsigned int i = 0; i < MHSd->getNumArgs(); ++i) {
-    Init *Arg = MHSd->getArg(i);
-    Init *NewArg;
-    StringInit *ArgName = MHSd->getArgName(i);
+    const Init *Arg = MHSd->getArg(i);
+    const Init *NewArg;
+    const StringInit *ArgName = MHSd->getArgName(i);

-    if (DagInit *Argd = dyn_cast<DagInit>(Arg))
+    if (const DagInit *Argd = dyn_cast<DagInit>(Arg))
       NewArg = ForeachDagApply(LHS, Argd, RHS, CurRec);
     else
       NewArg = ItemApply(LHS, Arg, RHS, CurRec);
@@ -1652,16 +1659,17 @@ static Init *ForeachDagApply(Init *LHS, DagInit *MHSd, Init *RHS,
 }

 // Applies RHS to all elements of MHS, using LHS as a temp variable.
-static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, const RecTy *Type,
-                           Record *CurRec) {
-  if (DagInit *MHSd = dyn_cast<DagInit>(MHS))
+static const Init *ForeachHelper(const Init *LHS, const Init *MHS,
+                                 const Init *RHS, const RecTy *Type,
+                                 const Record *CurRec) {
+  if (const DagInit *MHSd = dyn_cast<DagInit>(MHS))
     return ForeachDagApply(LHS, MHSd, RHS, CurRec);

-  if (ListInit *MHSl = dyn_cast<ListInit>(MHS)) {
-    SmallVector<Init *, 8> NewList(MHSl->begin(), MHSl->end());
+  if (const ListInit *MHSl = dyn_cast<ListInit>(MHS)) {
+    SmallVector<const Init *, 8> NewList(MHSl->begin(), MHSl->end());

-    for (Init *&Item : NewList) {
-      Init *NewItem = ItemApply(LHS, Item, RHS, CurRec);
+    for (const Init *&Item : NewList) {
+      const Init *NewItem = ItemApply(LHS, Item, RHS, CurRec);
       if (NewItem != Item)
         Item = NewItem;
     }
@@ -1673,16 +1681,17 @@ static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, const RecTy *Type,

 // Evaluates RHS for all elements of MHS, using LHS as a temp variable.
 // Creates a new list with the elements that evaluated to true.
-static Init *FilterHelper(Init *LHS, Init *MHS, Init *RHS, const RecTy *Type,
-                          Record *CurRec) {
-  if (ListInit *MHSl = dyn_cast<ListInit>(MHS)) {
-    SmallVector<Init *, 8> NewList;
-
-    for (Init *Item : MHSl->getValues()) {
-      Init *Include = ItemApply(LHS, Item, RHS, CurRec);
+static const Init *FilterHelper(const Init *LHS, const Init *MHS,
+                                const Init *RHS, const RecTy *Type,
+                                const Record *CurRec) {
+  if (const ListInit *MHSl = dyn_cast<ListInit>(MHS)) {
+    SmallVector<const Init *, 8> NewList;
+
+    for (const Init *Item : MHSl->getValues()) {
+      const Init *Include = ItemApply(LHS, Item, RHS, CurRec);
       if (!Include)
         return nullptr;
-      if (IntInit *IncludeInt =
+      if (const IntInit *IncludeInt =
               dyn_cast_or_null<IntInit>(Include->convertInitializerTo(
                   IntRecTy::get(LHS->getRecordKeeper())))) {
         if (IncludeInt->getValue())
@@ -1697,21 +1706,21 @@ static Init *FilterHelper(Init *LHS, Init *MHS, Init *RHS, const RecTy *Type,
   return nullptr;
 }

-Init *TernOpInit::Fold(Record *CurRec) const {
+const Init *TernOpInit::Fold(const Record *CurRec) const {
   RecordKeeper &RK = getRecordKeeper();
   switch (getOpcode()) {
   case SUBST: {
-    DefInit *LHSd = dyn_cast<DefInit>(LHS);
-    VarInit *LHSv = dyn_cast<VarInit>(LHS);
-    StringInit *LHSs = dyn_cast<StringInit>(LHS);
+    const DefInit *LHSd = dyn_cast<DefInit>(LHS);
+    const VarInit *LHSv = dyn_cast<VarInit>(LHS);
+    const StringInit *LHSs = dyn_cast<StringInit>(LHS);

-    DefInit *MHSd = dyn_cast<DefInit>(MHS);
-    VarInit *MHSv = dyn_cast<VarInit>(MHS);
-    StringInit *MHSs = dyn_cast<StringInit>(MHS);
+    const DefInit *MHSd = dyn_cast<DefInit>(MHS);
+    const VarInit *MHSv = dyn_cast<VarInit>(MHS);
+    const StringInit *MHSs = dyn_cast<StringInit>(MHS);

-    DefInit *RHSd = dyn_cast<DefInit>(RHS);
-    VarInit *RHSv = dyn_cast<VarInit>(RHS);
-    StringInit *RHSs = dyn_cast<StringInit>(RHS);
+    const DefInit *RHSd = dyn_cast<DefInit>(RHS);
+    const VarInit *RHSv = dyn_cast<VarInit>(RHS);
+    const StringInit *RHSs = dyn_cast<StringInit>(RHS);

     if (LHSd && MHSd && RHSd) {
       const Record *Val = RHSd->getDef();
@@ -1745,19 +1754,19 @@ Init *TernOpInit::Fold(Record *CurRec) const {
   }

   case FOREACH: {
-    if (Init *Result = ForeachHelper(LHS, MHS, RHS, getType(), CurRec))
+    if (const Init *Result = ForeachHelper(LHS, MHS, RHS, getType(), CurRec))
       return Result;
     break;
   }

   case FILTER: {
-    if (Init *Result = FilterHelper(LHS, MHS, RHS, getType(), CurRec))
+    if (const Init *Result = FilterHelper(LHS, MHS, RHS, getType(), CurRec))
       return Result;
     break;
   }

   case IF: {
-    if (IntInit *LHSi = dyn_cast_or_null<IntInit>(
+    if (const IntInit *LHSi = dyn_cast_or_null<IntInit>(
             LHS->convertInitializerTo(IntRecTy::get(RK)))) {
       if (LHSi->getValue())
         return MHS;
@@ -1767,8 +1776,8 @@ Init *TernOpInit::Fold(Record *CurRec) const {
   }

   case DAG: {
-    ListInit *MHSl = dyn_cast<ListInit>(MHS);
-    ListInit *RHSl = dyn_cast<ListInit>(RHS);
+    const ListInit *MHSl = dyn_cast<ListInit>(MHS);
+    const ListInit *RHSl = dyn_cast<ListInit>(RHS);
     bool MHSok = MHSl || isa<UnsetInit>(MHS);
     bool RHSok = RHSl || isa<UnsetInit>(RHS);

@@ -1776,11 +1785,11 @@ Init *TernOpInit::Fold(Record *CurRec) const {
       break; // Typically prevented by the parser, but might happen with template args

     if (MHSok && RHSok && (!MHSl || !RHSl || MHSl->size() == RHSl->size())) {
-      SmallVector<std::pair<Init *, StringInit *>, 8> Children;
+      SmallVector<std::pair<const Init *, const StringInit *>, 8> Children;
       unsigned Size = MHSl ? MHSl->size() : RHSl->size();
       for (unsigned i = 0; i != Size; ++i) {
-        Init *Node = MHSl ? MHSl->getElement(i) : UnsetInit::get(RK);
-        Init *Name = RHSl ? RHSl->getElement(i) : UnsetInit::get(RK);
+        const Init *Node = MHSl ? MHSl->getElement(i) : UnsetInit::get(RK);
+        const Init *Name = RHSl ? RHSl->getElement(i) : UnsetInit::get(RK);
         if (!isa<StringInit>(Name) && !isa<UnsetInit>(Name))
           return const_cast<TernOpInit *>(this);
         Children.emplace_back(Node, dyn_cast<StringInit>(Name));
@@ -1803,7 +1812,7 @@ Init *TernOpInit::Fold(Record *CurRec) const {
     if (Step == 0)
       PrintError(CurRec->getLoc(), "Step of !range can't be 0");

-    SmallVector<Init *, 8> Args;
+    SmallVector<const Init *, 8> Args;
     if (Start < End && Step > 0) {
       Args.reserve((End - Start) / Step);
       for (auto I = Start; I < End; I += Step)
@@ -1819,9 +1828,9 @@ Init *TernOpInit::Fold(Record *CurRec) const {
   }

   case SUBSTR: {
-    StringInit *LHSs = dyn_cast<StringInit>(LHS);
-    IntInit *MHSi = dyn_cast<IntInit>(MHS);
-    IntInit *RHSi = dyn_cast<IntInit>(RHS);
+    const StringInit *LHSs = dyn_cast<StringInit>(LHS);
+    const IntInit *MHSi = dyn_cast<IntInit>(MHS);
+    const IntInit *RHSi = dyn_cast<IntInit>(RHS);
     if (LHSs && MHSi && RHSi) {
       int64_t StringSize = LHSs->getValue().size();
       int64_t Start = MHSi->getValue();
@@ -1840,9 +1849,9 @@ Init *TernOpInit::Fold(Record *CurRec) const {
   }

   case FIND: {
-    StringInit *LHSs = dyn_cast<StringInit>(LHS);
-    StringInit *MHSs = dyn_cast<StringInit>(MHS);
-    IntInit *RHSi = dyn_cast<IntInit>(RHS);
+    const StringInit *LHSs = dyn_cast<StringInit>(LHS);
+    const StringInit *MHSs = dyn_cast<StringInit>(MHS);
+    const IntInit *RHSi = dyn_cast<IntInit>(RHS);
     if (LHSs && MHSs && RHSi) {
       int64_t SourceSize = LHSs->getValue().size();
       int64_t Start = RHSi->getValue();
@@ -1860,7 +1869,7 @@ Init *TernOpInit::Fold(Record *CurRec) const {
   }

   case SETDAGARG: {
-    DagInit *Dag = dyn_cast<DagInit>(LHS);
+    const DagInit *Dag = dyn_cast<DagInit>(LHS);
     if (Dag && isa<StringInit, IntInit>(MHS)) {
       std::string Error;
       auto ArgNo = getDagArgNoByKey(Dag, MHS, Error);
@@ -1869,8 +1878,8 @@ Init *TernOpInit::Fold(Record *CurRec) const {

       assert(*ArgNo < Dag->getNumArgs());

-      SmallVector<Init *, 8> Args(Dag->getArgs());
-      SmallVector<StringInit *, 8> Names(Dag->getArgNames());
+      SmallVector<const Init *, 8> Args(Dag->getArgs());
+      SmallVector<const StringInit *, 8> Names(Dag->getArgNames());
       Args[*ArgNo] = RHS;
       return DagInit::get(Dag->getOperator(), Dag->getName(), Args, Names);
     }
@@ -1887,8 +1896,8 @@ Init *TernOpInit::Fold(Record *CurRec) const {

       assert(*ArgNo < Dag->getNumArgs());

-      SmallVector<Init *, 8> Args(Dag->getArgs());
-      SmallVector<StringInit *, 8> Names(Dag->getArgNames());
+      SmallVector<const Init *, 8> Args(Dag->getArgs());
+      SmallVector<const StringInit *, 8> Names(Dag->getArgNames());
       Names[*ArgNo] = dyn_cast<StringInit>(RHS);
       return DagInit::get(Dag->getOperator(), Dag->getName(), Args, Names);
     }
@@ -1899,11 +1908,11 @@ Init *TernOpInit::Fold(Record *CurRec) const {
   return const_cast<TernOpInit *>(this);
 }

-Init *TernOpInit::resolveReferences(Resolver &R) const {
-  Init *lhs = LHS->resolveReferences(R);
+const Init *TernOpInit::resolveReferences(Resolver &R) const {
+  const Init *lhs = LHS->resolveReferences(R);

   if (getOpcode() == IF && lhs != LHS) {
-    if (IntInit *Value = dyn_cast_or_null<IntInit>(
+    if (const IntInit *Value = dyn_cast_or_null<IntInit>(
             lhs->convertInitializerTo(IntRecTy::get(getRecordKeeper())))) {
       // Short-circuit
       if (Value->getValue())
@@ -1912,8 +1921,8 @@ Init *TernOpInit::resolveReferences(Resolver &R) const {
     }
   }

-  Init *mhs = MHS->resolveReferences(R);
-  Init *rhs;
+  const Init *mhs = MHS->resolveReferences(R);
+  const Init *rhs;

   if (getOpcode() == FOREACH || getOpcode() == FILTER) {
     ShadowResolver SR(R);
@@ -1926,7 +1935,7 @@ Init *TernOpInit::resolveReferences(Resolver &R) const {
   if (LHS != lhs || MHS != mhs || RHS != rhs)
     return (TernOpInit::get(getOpcode(), lhs, mhs, rhs, getType()))
         ->Fold(R.getCurrentRecord());
-  return const_cast<TernOpInit *>(this);
+  return this;
 }

 std::string TernOpInit::getAsString() const {
@@ -1955,8 +1964,9 @@ std::string TernOpInit::getAsString() const {
           ", " + MHS->getAsString() + ", " + RHS->getAsString() + ")");
 }

-static void ProfileFoldOpInit(FoldingSetNodeID &ID, Init *Start, Init *List,
-                              Init *A, Init *B, Init *Expr, const RecTy *Type) {
+static void ProfileFoldOpInit(FoldingSetNodeID &ID, const Init *Start,
+                              const Init *List, const Init *A, const Init *B,
+                              const Init *Expr, const RecTy *Type) {
   ID.AddPointer(Start);
   ID.AddPointer(List);
   ID.AddPointer(A);
@@ -1965,14 +1975,15 @@ static void ProfileFoldOpInit(FoldingSetNodeID &ID, Init *Start, Init *List,
   ID.AddPointer(Type);
 }

-FoldOpInit *FoldOpInit::get(Init *Start, Init *List, Init *A, Init *B,
-                            Init *Expr, const RecTy *Type) {
+const FoldOpInit *FoldOpInit::get(const Init *Start, const Init *List,
+                                  const Init *A, const Init *B,
+                                  const Init *Expr, const RecTy *Type) {
   FoldingSetNodeID ID;
   ProfileFoldOpInit(ID, Start, List, A, B, Expr, Type);

   detail::RecordKeeperImpl &RK = Start->getRecordKeeper().getImpl();
   void *IP = nullptr;
-  if (FoldOpInit *I = RK.TheFoldOpInitPool.FindNodeOrInsertPos(ID, IP))
+  if (const FoldOpInit *I = RK.TheFoldOpInitPool.FindNodeOrInsertPos(ID, IP))
     return I;

   FoldOpInit *I = new (RK.Allocator) FoldOpInit(Start, List, A, B, Expr, Type);
@@ -1984,10 +1995,10 @@ void FoldOpInit::Profile(FoldingSetNodeID &ID) const {
   ProfileFoldOpInit(ID, Start, List, A, B, Expr, getType());
 }

-Init *FoldOpInit::Fold(Record *CurRec) const {
-  if (ListInit *LI = dyn_cast<ListInit>(List)) {
-    Init *Accum = Start;
-    for (Init *Elt : *LI) {
+const Init *FoldOpInit::Fold(const Record *CurRec) const {
+  if (const ListInit *LI = dyn_cast<ListInit>(List)) {
+    const Init *Accum = Start;
+    for (const Init *Elt : *LI) {
       MapResolver R(CurRec);
       R.set(A, Accum);
       R.set(B, Elt);
@@ -1995,25 +2006,25 @@ Init *FoldOpInit::Fold(Record *CurRec) const {
     }
     return Accum;
   }
-  return const_cast<FoldOpInit *>(this);
+  return this;
 }

-Init *FoldOpInit::resolveReferences(Resolver &R) const {
-  Init *NewStart = Start->resolveReferences(R);
-  Init *NewList = List->resolveReferences(R);
+const Init *FoldOpInit::resolveReferences(Resolver &R) const {
+  const Init *NewStart = Start->resolveReferences(R);
+  const Init *NewList = List->resolveReferences(R);
   ShadowResolver SR(R);
   SR.addShadow(A);
   SR.addShadow(B);
-  Init *NewExpr = Expr->resolveReferences(SR);
+  const Init *NewExpr = Expr->resolveReferences(SR);

   if (Start == NewStart && List == NewList && Expr == NewExpr)
-    return const_cast<FoldOpInit *>(this);
+    return this;

   return get(NewStart, NewList, A, B, NewExpr, getType())
       ->Fold(R.getCurrentRecord());
 }

-Init *FoldOpInit::getBit(unsigned Bit) const {
+const Init *FoldOpInit::getBit(unsigned Bit) const {
   return VarBitInit::get(const_cast<FoldOpInit *>(this), Bit);
 }

@@ -2025,19 +2036,19 @@ std::string FoldOpInit::getAsString() const {
 }

 static void ProfileIsAOpInit(FoldingSetNodeID &ID, const RecTy *CheckType,
-                             Init *Expr) {
+                             const Init *Expr) {
   ID.AddPointer(CheckType);
   ID.AddPointer(Expr);
 }

-IsAOpInit *IsAOpInit::get(const RecTy *CheckType, Init *Expr) {
+const IsAOpInit *IsAOpInit::get(const RecTy *CheckType, const Init *Expr) {
   FoldingSetNodeID ID;
   ProfileIsAOpInit(ID, CheckType, Expr);

   detail::RecordKeeperImpl &RK = Expr->getRecordKeeper().getImpl();
   void *IP = nullptr;
-  if (IsAOpInit *I = RK.TheIsAOpInitPool.FindNodeOrInsertPos(ID, IP))
+  if (const IsAOpInit *I = RK.TheIsAOpInitPool.FindNodeOrInsertPos(ID, IP))
     return I;

   IsAOpInit *I = new (RK.Allocator) IsAOpInit(CheckType, Expr);
@@ -2049,8 +2060,8 @@ void IsAOpInit::Profile(FoldingSetNodeID &ID) const {
   ProfileIsAOpInit(ID, CheckType, Expr);
 }

-Init *IsAOpInit::Fold() const {
-  if (TypedInit *TI = dyn_cast<TypedInit>(Expr)) {
+const Init *IsAOpInit::Fold() const {
+  if (const TypedInit *TI = dyn_cast<TypedInit>(Expr)) {
     // Is the expression type known to be (a subclass of) the desired type?
     if (TI->getType()->typeIsConvertibleTo(CheckType))
       return IntInit::get(getRecordKeeper(), 1);
@@ -2066,17 +2077,17 @@ Init *IsAOpInit::Fold() const {
       return IntInit::get(getRecordKeeper(), 0);
     }
   }
-  return const_cast<IsAOpInit *>(this);
+  return this;
 }

-Init *IsAOpInit::resolveReferences(Resolver &R) const {
-  Init *NewExpr = Expr->resolveReferences(R);
+const Init *IsAOpInit::resolveReferences(Resolver &R) const {
+  const Init *NewExpr = Expr->resolveReferences(R);
   if (Expr != NewExpr)
     return get(CheckType, NewExpr)->Fold();
-  return const_cast<IsAOpInit *>(this);
+  return this;
 }

-Init *IsAOpInit::getBit(unsigned Bit) const {
+const Init *IsAOpInit::getBit(unsigned Bit) const {
   return VarBitInit::get(const_cast<IsAOpInit *>(this), Bit);
 }

@@ -2087,18 +2098,20 @@ std::string IsAOpInit::getAsString() const {
 }

 static void ProfileExistsOpInit(FoldingSetNodeID &ID, const RecTy *CheckType,
-                                Init *Expr) {
+                                const Init *Expr) {
   ID.AddPointer(CheckType);
   ID.AddPointer(Expr);
 }

-ExistsOpInit *ExistsOpInit::get(const RecTy *CheckType, Init *Expr) {
+const ExistsOpInit *ExistsOpInit::get(const RecTy *CheckType,
+                                      const Init *Expr) {
   FoldingSetNodeID ID;
   ProfileExistsOpInit(ID, CheckType, Expr);

   detail::RecordKeeperImpl &RK = Expr->getRecordKeeper().getImpl();
   void *IP = nullptr;
-  if (ExistsOpInit *I = RK.TheExistsOpInitPool.FindNodeOrInsertPos(ID, IP))
+  if (const ExistsOpInit *I =
+          RK.TheExistsOpInitPool.FindNodeOrInsertPos(ID, IP))
     return I;

   ExistsOpInit *I = new (RK.Allocator) ExistsOpInit(CheckType, Expr);
@@ -2110,9 +2123,8 @@ void ExistsOpInit::Profile(FoldingSetNodeID &ID) const {
   ProfileExistsOpInit(ID, CheckType, Expr);
 }

-Init *ExistsOpInit::Fold(Record *CurRec, bool IsFinal) const {
-  if (StringInit *Name = dyn_cast<StringInit>(Expr)) {
-
+const Init *ExistsOpInit::Fold(const Record *CurRec, bool IsFinal) const {
+  if (const StringInit *Name = dyn_cast<StringInit>(Expr)) {
     // Look up all defined records to see if we can find one.
     const Record *D = CheckType->getRecordKeeper().getDef(Name->getValue());
     if (D) {
@@ -2139,19 +2151,18 @@ Init *ExistsOpInit::Fold(Record *CurRec, bool IsFinal) const {

     if (IsFinal)
       return IntInit::get(getRecordKeeper(), 0);
-    return const_cast<ExistsOpInit *>(this);
   }
-  return const_cast<ExistsOpInit *>(this);
+  return this;
 }

-Init *ExistsOpInit::resolveReferences(Resolver &R) const {
-  Init *NewExpr = Expr->resolveReferences(R);
+const Init *ExistsOpInit::resolveReferences(Resolver &R) const {
+  const Init *NewExpr = Expr->resolveReferences(R);
   if (Expr != NewExpr || R.isFinal())
     return get(CheckType, NewExpr)->Fold(R.getCurrentRecord(), R.isFinal());
-  return const_cast<ExistsOpInit *>(this);
+  return this;
 }

-Init *ExistsOpInit::getBit(unsigned Bit) const {
+const Init *ExistsOpInit::getBit(unsigned Bit) const {
   return VarBitInit::get(const_cast<ExistsOpInit *>(this), Bit);
 }

@@ -2161,7 +2172,7 @@ std::string ExistsOpInit::getAsString() const {
       .str();
 }

-const RecTy *TypedInit::getFieldType(StringInit *FieldName) const {
+const RecTy *TypedInit::getFieldType(const StringInit *FieldName) const {
   if (const RecordRecTy *RecordType = dyn_cast<RecordRecTy>(getType())) {
     for (const Record *Rec : RecordType->getClasses()) {
       if (const RecordVal *Field = Rec->getValue(FieldName))
@@ -2171,7 +2182,7 @@ const RecTy *TypedInit::getFieldType(StringInit *FieldName) const {
   return nullptr;
 }

-Init *TypedInit::convertInitializerTo(const RecTy *Ty) const {
+const Init *TypedInit::convertInitializerTo(const RecTy *Ty) const {
   if (getType() == Ty || getType()->typeIsA(Ty))
     return const_cast<TypedInit *>(this);

@@ -2182,12 +2193,13 @@ Init *TypedInit::convertInitializerTo(const RecTy *Ty) const {
   return nullptr;
 }

-Init *TypedInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
+const Init *
+TypedInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
   const BitsRecTy *T = dyn_cast<BitsRecTy>(getType());
   if (!T)
     return nullptr; // Cannot subscript a non-bits variable.
   unsigned NumBits = T->getNumBits();

-  SmallVector<Init *, 16> NewBits;
+  SmallVector<const Init *, 16> NewBits;
   NewBits.reserve(Bits.size());
   for (unsigned Bit : Bits) {
     if (Bit >= NumBits)
@@ -2198,12 +2210,12 @@ Init *TypedInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
   return BitsInit::get(getRecordKeeper(), NewBits);
 }

-Init *TypedInit::getCastTo(const RecTy *Ty) const {
+const Init *TypedInit::getCastTo(const RecTy *Ty) const {
   // Handle the common case quickly
   if (getType() == Ty || getType()->typeIsA(Ty))
     return const_cast<TypedInit *>(this);

-  if (Init *Converted = convertInitializerTo(Ty)) {
+  if (const Init *Converted = convertInitializerTo(Ty)) {
     assert(!isa<TypedInit>(Converted) ||
            cast<TypedInit>(Converted)->getType()->typeIsA(Ty));
     return Converted;
@@ -2216,12 +2228,12 @@ Init *TypedInit::getCastTo(const RecTy *Ty) const {
       ->Fold(nullptr);
 }

-VarInit *VarInit::get(StringRef VN, const RecTy *T) {
-  Init *Value = StringInit::get(T->getRecordKeeper(), VN);
+const VarInit *VarInit::get(StringRef VN, const RecTy *T) {
+  const Init *Value = StringInit::get(T->getRecordKeeper(), VN);
   return VarInit::get(Value, T);
 }

-VarInit *VarInit::get(Init *VN, const RecTy *T) {
+const VarInit *VarInit::get(const Init *VN, const RecTy *T) {
   detail::RecordKeeperImpl &RK = T->getRecordKeeper().getImpl();
   VarInit *&I = RK.TheVarInitPool[std::make_pair(T, VN)];
   if (!I)
@@ -2230,23 +2242,23 @@ VarInit *VarInit::get(Init *VN, const RecTy *T) {
 }

 StringRef VarInit::getName() const {
-  StringInit *NameString = cast<StringInit>(getNameInit());
+  const StringInit *NameString = cast<StringInit>(getNameInit());
   return NameString->getValue();
 }

-Init *VarInit::getBit(unsigned Bit) const {
+const Init *VarInit::getBit(unsigned Bit) const {
   if (getType() == BitRecTy::get(getRecordKeeper()))
     return const_cast<VarInit *>(this);
   return VarBitInit::get(const_cast<VarInit *>(this), Bit);
 }

-Init *VarInit::resolveReferences(Resolver &R) const {
-  if (Init *Val = R.resolve(VarName))
+const Init *VarInit::resolveReferences(Resolver &R) const {
+  if (const Init *Val = R.resolve(VarName))
     return Val;
-  return const_cast<VarInit *>(this);
+  return this;
 }

-VarBitInit *VarBitInit::get(TypedInit *T, unsigned B) {
+const VarBitInit *VarBitInit::get(const TypedInit *T, unsigned B) {
   detail::RecordKeeperImpl &RK = T->getRecordKeeper().getImpl();
   VarBitInit *&I = RK.TheVarBitInitPool[std::make_pair(T, B)];
   if (!I)
@@ -2258,25 +2270,25 @@ std::string VarBitInit::getAsString() const {
   return TI->getAsString() + "{" + utostr(Bit) + "}";
 }

-Init *VarBitInit::resolveReferences(Resolver &R) const {
-  Init *I = TI->resolveReferences(R);
+const Init *VarBitInit::resolveReferences(Resolver &R) const {
+  const Init *I = TI->resolveReferences(R);
   if (TI != I)
     return I->getBit(getBitNum());

-  return const_cast<VarBitInit *>(this);
+  return this;
 }

 DefInit::DefInit(const Record *D)
     : TypedInit(IK_DefInit, D->getType()), Def(D) {}

-Init *DefInit::convertInitializerTo(const RecTy *Ty) const {
+const Init *DefInit::convertInitializerTo(const RecTy *Ty) const {
   if (auto *RRT = dyn_cast<RecordRecTy>(Ty))
     if (getType()->typeIsConvertibleTo(RRT))
       return const_cast<DefInit *>(this);
   return nullptr;
 }

-const RecTy *DefInit::getFieldType(StringInit *FieldName) const {
+const RecTy *DefInit::getFieldType(const StringInit *FieldName) const {
   if (const RecordVal *RV = Def->getValue(FieldName))
     return RV->getType();
   return nullptr;
@@ -2285,11 +2297,11 @@ const RecTy *DefInit::getFieldType(StringInit *FieldName) const {
 std::string DefInit::getAsString() const { return std::string(Def->getName()); }

 static void ProfileVarDefInit(FoldingSetNodeID &ID, Record *Class,
-                              ArrayRef<ArgumentInit *> Args) {
+                              ArrayRef<const ArgumentInit *> Args) {
   ID.AddInteger(Args.size());
   ID.AddPointer(Class);

-  for (Init *I : Args)
+  for (const Init *I : Args)
     ID.AddPointer(I);
 }

@@ -2297,21 +2309,21 @@ VarDefInit::VarDefInit(SMLoc Loc, Record *Class, unsigned N)
     : TypedInit(IK_VarDefInit, RecordRecTy::get(Class)), Loc(Loc),
       Class(Class), NumArgs(N) {}

-VarDefInit *VarDefInit::get(SMLoc Loc, Record *Class,
-                            ArrayRef<ArgumentInit *> Args) {
+const VarDefInit *VarDefInit::get(SMLoc Loc, Record *Class,
+                                  ArrayRef<const ArgumentInit *> Args) {
   FoldingSetNodeID ID;
   ProfileVarDefInit(ID, Class, Args);

   detail::RecordKeeperImpl &RK = Class->getRecords().getImpl();
   void *IP = nullptr;
-  if (VarDefInit *I = RK.TheVarDefInitPool.FindNodeOrInsertPos(ID, IP))
+  if (const VarDefInit *I = RK.TheVarDefInitPool.FindNodeOrInsertPos(ID, IP))
     return I;

   void *Mem = RK.Allocator.Allocate(
-      totalSizeToAlloc<ArgumentInit *>(Args.size()), alignof(VarDefInit));
+      totalSizeToAlloc<const ArgumentInit *>(Args.size()), alignof(VarDefInit));
   VarDefInit *I = new (Mem) VarDefInit(Loc, Class, Args.size());
   std::uninitialized_copy(Args.begin(), Args.end(),
-                          I->getTrailingObjects<ArgumentInit *>());
+                          I->getTrailingObjects<const ArgumentInit *>());
   RK.TheVarDefInitPool.InsertNode(I, IP);
   return I;
 }
@@ -2320,7 +2332,7 @@ void VarDefInit::Profile(FoldingSetNodeID &ID) const {
   ProfileVarDefInit(ID, Class, args());
 }

-DefInit *VarDefInit::instantiate() {
+const DefInit *VarDefInit::instantiate() {
   if (Def)
     return Def;

@@ -2340,10 +2352,10 @@ DefInit *VarDefInit::instantiate() {
   NewRec->appendDumps(Class);

   // Substitute and resolve template arguments
-  ArrayRef<Init *> TArgs = Class->getTemplateArgs();
+  ArrayRef<const Init *> TArgs = Class->getTemplateArgs();
   MapResolver R(NewRec);

-  for (Init *Arg : TArgs) {
+  for (const Init *Arg : TArgs) {
     R.set(Arg, NewRec->getValue(Arg)->getValue());
     NewRec->removeValue(Arg);
   }
@@ -2377,13 +2389,13 @@ DefInit *VarDefInit::instantiate() {
   return Def = NewRec->getDefInit();
 }

-Init *VarDefInit::resolveReferences(Resolver &R) const {
+const Init *VarDefInit::resolveReferences(Resolver &R) const {
   TrackUnresolvedResolver UR(&R);
   bool Changed = false;
-  SmallVector<ArgumentInit *, 8> NewArgs;
+  SmallVector<const ArgumentInit *, 8> NewArgs;
   NewArgs.reserve(args_size());

-  for (ArgumentInit *Arg : args()) {
+  for (const ArgumentInit *Arg : args()) {
     auto *NewArg = cast<ArgumentInit>(Arg->resolveReferences(UR));
     NewArgs.push_back(NewArg);
     Changed |= NewArg != Arg;
@@ -2392,29 +2404,29 @@ Init *VarDefInit::resolveReferences(Resolver &R) const {
   if (Changed) {
     auto *New = VarDefInit::get(Loc, Class, NewArgs);
     if (!UR.foundUnresolved())
-      return New->instantiate();
+      return const_cast<VarDefInit *>(New)->instantiate();
     return New;
   }
-  return const_cast<VarDefInit *>(this);
+  return this;
 }

-Init *VarDefInit::Fold() const {
+const Init *VarDefInit::Fold() const {
   if (Def)
     return Def;

   TrackUnresolvedResolver R;
-  for (Init *Arg : args())
+  for (const Init *Arg : args())
     Arg->resolveReferences(R);

   if (!R.foundUnresolved())
     return const_cast<VarDefInit *>(this)->instantiate();
-  return const_cast<VarDefInit *>(this);
+  return this;
 }

 std::string VarDefInit::getAsString() const {
   std::string Result = Class->getNameInitAsString() + "<";
   const char *sep = "";
-  for (Init *Arg : args()) {
+  for (const Init *Arg : args()) {
     Result += sep;
     sep = ", ";
     Result += Arg->getAsString();
@@ -2422,7 +2434,7 @@ std::string VarDefInit::getAsString() const {
   return Result + ">";
 }

-FieldInit *FieldInit::get(Init *R, StringInit *FN) {
+const FieldInit *FieldInit::get(const Init *R, const StringInit *FN) {
   detail::RecordKeeperImpl &RK = R->getRecordKeeper().getImpl();
   FieldInit *&I = RK.TheFieldInitPool[std::make_pair(R, FN)];
   if (!I)
@@ -2430,28 +2442,28 @@ FieldInit *FieldInit::get(Init *R, StringInit *FN) {
   return I;
 }

-Init *FieldInit::getBit(unsigned Bit) const {
+const Init *FieldInit::getBit(unsigned Bit) const {
   if (getType() == BitRecTy::get(getRecordKeeper()))
     return const_cast<FieldInit *>(this);
   return VarBitInit::get(const_cast<FieldInit *>(this), Bit);
 }

-Init *FieldInit::resolveReferences(Resolver &R) const {
-  Init *NewRec = Rec->resolveReferences(R);
+const Init *FieldInit::resolveReferences(Resolver &R) const {
+  const Init *NewRec = Rec->resolveReferences(R);
   if (NewRec != Rec)
     return FieldInit::get(NewRec, FieldName)->Fold(R.getCurrentRecord());
-  return const_cast<FieldInit *>(this);
+  return this;
 }

-Init *FieldInit::Fold(Record *CurRec) const {
-  if (DefInit *DI = dyn_cast<DefInit>(Rec)) {
+const Init *FieldInit::Fold(const Record *CurRec) const {
+  if (const DefInit *DI = dyn_cast<DefInit>(Rec)) {
     const Record *Def = DI->getDef();
     if (Def == CurRec)
       PrintFatalError(CurRec->getLoc(),
                       Twine("Attempting to access field '") +
                           FieldName->getAsUnquotedString() + "' of '" +
                           Rec->getAsString() + "' is a forbidden self-reference");
-    Init *FieldVal = Def->getValue(FieldName)->getValue();
+    const Init *FieldVal = Def->getValue(FieldName)->getValue();
     if (FieldVal->isConcrete())
       return FieldVal;
   }
@@ -2459,22 +2471,22 @@ Init *FieldInit::Fold(Record *CurRec) const {
 }

 bool FieldInit::isConcrete() const {
-  if (DefInit *DI = dyn_cast<DefInit>(Rec)) {
-    Init *FieldVal = DI->getDef()->getValue(FieldName)->getValue();
+  if (const DefInit *DI = dyn_cast<DefInit>(Rec)) {
+    const Init *FieldVal = DI->getDef()->getValue(FieldName)->getValue();
     return FieldVal->isConcrete();
   }
   return false;
 }

 static void ProfileCondOpInit(FoldingSetNodeID &ID,
-                              ArrayRef<Init *> CondRange,
-                              ArrayRef<Init *> ValRange,
-                              const RecTy *ValType) {
+                              ArrayRef<const Init *> CondRange,
+                              ArrayRef<const Init *> ValRange,
+                              const RecTy *ValType) {
   assert(CondRange.size() == ValRange.size() &&
          "Number of conditions and values must match!");
   ID.AddPointer(ValType);
-  ArrayRef<Init *>::iterator Case = CondRange.begin();
-  ArrayRef<Init *>::iterator Val = ValRange.begin();
+  ArrayRef<const Init *>::iterator Case = CondRange.begin();
+  ArrayRef<const Init *>::iterator Val = ValRange.begin();

   while (Case != CondRange.end()) {
     ID.AddPointer(*Case++);
@@ -2483,13 +2495,15 @@ static void ProfileCondOpInit(FoldingSetNodeID &ID,
 }

 void CondOpInit::Profile(FoldingSetNodeID &ID) const {
-  ProfileCondOpInit(ID, ArrayRef(getTrailingObjects<Init *>(), NumConds),
-                    ArrayRef(getTrailingObjects<Init *>() + NumConds, NumConds),
-                    ValType);
+  ProfileCondOpInit(
+      ID, ArrayRef(getTrailingObjects<const Init *>(), NumConds),
+      ArrayRef(getTrailingObjects<const Init *>() + NumConds, NumConds),
+      ValType);
 }

-CondOpInit *CondOpInit::get(ArrayRef<Init *> CondRange,
-                            ArrayRef<Init *> ValRange, const RecTy *Ty) {
+const CondOpInit *CondOpInit::get(ArrayRef<const Init *> CondRange,
+                                  ArrayRef<const Init *> ValRange,
+                                  const RecTy *Ty) {
   assert(CondRange.size() == ValRange.size() &&
          "Number of conditions and values must match!");
@@ -2498,33 +2512,34 @@ CondOpInit *CondOpInit::get(ArrayRef<Init *> CondRange,
   detail::RecordKeeperImpl &RK = Ty->getRecordKeeper().getImpl();
   void *IP = nullptr;
-  if (CondOpInit *I = RK.TheCondOpInitPool.FindNodeOrInsertPos(ID, IP))
+  if (const CondOpInit *I = RK.TheCondOpInitPool.FindNodeOrInsertPos(ID, IP))
     return I;

   void *Mem = RK.Allocator.Allocate(
-      totalSizeToAlloc<Init *>(2 * CondRange.size()), alignof(BitsInit));
+      totalSizeToAlloc<const Init *>(2 * CondRange.size()), alignof(BitsInit));
   CondOpInit *I = new(Mem) CondOpInit(CondRange.size(), Ty);

   std::uninitialized_copy(CondRange.begin(), CondRange.end(),
-                          I->getTrailingObjects<Init *>());
+                          I->getTrailingObjects<const Init *>());
   std::uninitialized_copy(ValRange.begin(), ValRange.end(),
-                          I->getTrailingObjects<Init *>()+CondRange.size());
+                          I->getTrailingObjects<const Init *>() +
+                              CondRange.size());
   RK.TheCondOpInitPool.InsertNode(I, IP);
   return I;
 }

-Init *CondOpInit::resolveReferences(Resolver &R) const {
-  SmallVector<Init*, 4> NewConds;
+const Init *CondOpInit::resolveReferences(Resolver &R) const {
+  SmallVector<const Init *, 4> NewConds;
   bool Changed = false;
   for (const Init *Case : getConds()) {
-    Init *NewCase = Case->resolveReferences(R);
+    const Init *NewCase = Case->resolveReferences(R);
     NewConds.push_back(NewCase);
     Changed |= NewCase != Case;
   }

-  SmallVector<Init*, 4> NewVals;
+  SmallVector<const Init *, 4> NewVals;
   for (const Init *Val : getVals()) {
-    Init *NewVal = Val->resolveReferences(R);
+    const Init *NewVal = Val->resolveReferences(R);
     NewVals.push_back(NewVal);
     Changed |= NewVal != Val;
   }
@@ -2533,16 +2548,16 @@ Init *CondOpInit::resolveReferences(Resolver &R) const {
     return (CondOpInit::get(NewConds, NewVals,
             getValType()))->Fold(R.getCurrentRecord());

-  return const_cast<CondOpInit *>(this);
+  return this;
 }

-Init *CondOpInit::Fold(Record *CurRec) const {
+const Init *CondOpInit::Fold(const Record *CurRec) const {
   RecordKeeper &RK = getRecordKeeper();
-  for ( unsigned i = 0; i < NumConds; ++i) {
-    Init *Cond = getCond(i);
-    Init *Val = getVal(i);
+  for (unsigned i = 0; i < NumConds; ++i) {
+    const Init *Cond = getCond(i);
+    const Init *Val = getVal(i);

-    if (IntInit *CondI = dyn_cast_or_null<IntInit>(
+    if (const IntInit *CondI = dyn_cast_or_null<IntInit>(
             Cond->convertInitializerTo(IntRecTy::get(RK)))) {
       if (CondI->getValue())
         return Val->convertInitializerTo(getValType());
@@ -2593,18 +2608,19 @@ std::string CondOpInit::getAsString() const {
   return Result + ")";
 }

-Init *CondOpInit::getBit(unsigned Bit) const {
+const Init *CondOpInit::getBit(unsigned Bit) const {
   return VarBitInit::get(const_cast<CondOpInit *>(this), Bit);
 }

-static void ProfileDagInit(FoldingSetNodeID &ID, Init *V, StringInit *VN,
-                           ArrayRef<Init *> ArgRange,
-                           ArrayRef<StringInit *> NameRange) {
+static void ProfileDagInit(FoldingSetNodeID &ID, const Init *V,
+                           const StringInit *VN,
+                           ArrayRef<const Init *> ArgRange,
+                           ArrayRef<const StringInit *> NameRange) {
   ID.AddPointer(V);
   ID.AddPointer(VN);

-  ArrayRef<Init *>::iterator Arg = ArgRange.begin();
-  ArrayRef<StringInit *>::iterator Name = NameRange.begin();
+  ArrayRef<const Init *>::iterator Arg = ArgRange.begin();
+  ArrayRef<const StringInit *>::iterator Name = NameRange.begin();
   while (Arg != ArgRange.end()) {
     assert(Name != NameRange.end() && "Arg name underflow!");
     ID.AddPointer(*Arg++);
@@ -2613,34 +2629,36 @@ static void ProfileDagInit(FoldingSetNodeID &ID, Init *V, StringInit *VN,
   assert(Name == NameRange.end() && "Arg name overflow!");
 }

-DagInit *DagInit::get(Init *V, StringInit *VN, ArrayRef<Init *> ArgRange,
-                      ArrayRef<StringInit *> NameRange) {
+const DagInit *DagInit::get(const Init *V, const StringInit *VN,
+                            ArrayRef<const Init *> ArgRange,
+                            ArrayRef<const StringInit *> NameRange) {
   assert(ArgRange.size() == NameRange.size());
   FoldingSetNodeID ID;
   ProfileDagInit(ID, V, VN, ArgRange, NameRange);

   detail::RecordKeeperImpl &RK = V->getRecordKeeper().getImpl();
   void *IP = nullptr;
-  if (DagInit *I = RK.TheDagInitPool.FindNodeOrInsertPos(ID, IP))
+  if (const DagInit *I = RK.TheDagInitPool.FindNodeOrInsertPos(ID, IP))
     return I;

-  void *Mem = RK.Allocator.Allocate(
-      totalSizeToAlloc<Init *, StringInit *>(ArgRange.size(), NameRange.size()),
-      alignof(BitsInit));
+  void *Mem =
+      RK.Allocator.Allocate(totalSizeToAlloc<const Init *, const StringInit *>(
+                                ArgRange.size(), NameRange.size()),
+                            alignof(BitsInit));
   DagInit *I = new (Mem) DagInit(V, VN, ArgRange.size(), NameRange.size());
   std::uninitialized_copy(ArgRange.begin(), ArgRange.end(),
-                          I->getTrailingObjects<Init *>());
+                          I->getTrailingObjects<const Init *>());
   std::uninitialized_copy(NameRange.begin(), NameRange.end(),
-                          I->getTrailingObjects<StringInit *>());
+                          I->getTrailingObjects<const StringInit *>());
   RK.TheDagInitPool.InsertNode(I, IP);
   return I;
 }

-DagInit *
-DagInit::get(Init *V, StringInit *VN,
-             ArrayRef<std::pair<Init *, StringInit *>> args) {
-  SmallVector<Init *, 8> Args;
-  SmallVector<StringInit *, 8> Names;
+const DagInit *
+DagInit::get(const Init *V, const StringInit *VN,
+             ArrayRef<std::pair<const Init *, const StringInit *>> args) {
+  SmallVector<const Init *, 8> Args;
+  SmallVector<const StringInit *, 8> Names;

   for (const auto &Arg : args) {
     Args.push_back(Arg.first);
@@ -2651,13 +2669,13 @@ DagInit::get(Init *V, StringInit *VN,
 }

 void DagInit::Profile(FoldingSetNodeID &ID) const {
-  ProfileDagInit(ID, Val, ValName,
-                 ArrayRef(getTrailingObjects<Init *>(), NumArgs),
-                 ArrayRef(getTrailingObjects<StringInit *>(), NumArgNames));
+  ProfileDagInit(
+      ID, Val, ValName, ArrayRef(getTrailingObjects<const Init *>(), NumArgs),
+      ArrayRef(getTrailingObjects<const StringInit *>(), NumArgNames));
 }

 const Record *DagInit::getOperatorAsDef(ArrayRef<SMLoc> Loc) const {
-  if (DefInit *DefI = dyn_cast<DefInit>(Val))
+  if (const DefInit *DefI = dyn_cast<DefInit>(Val))
     return DefI->getDef();
   PrintFatalError(Loc, "Expected record as operator");
   return nullptr;
@@ -2665,28 +2683,28 @@ const Record *DagInit::getOperatorAsDef(ArrayRef<SMLoc> Loc) const {

 std::optional<unsigned> DagInit::getArgNo(StringRef Name) const {
   for (unsigned i = 0, e = getNumArgs(); i < e; ++i) {
-    StringInit *ArgName = getArgName(i);
+    const StringInit *ArgName = getArgName(i);
     if (ArgName && ArgName->getValue() == Name)
       return i;
   }
   return std::nullopt;
 }

-Init *DagInit::resolveReferences(Resolver &R) const {
-  SmallVector<Init *, 8> NewArgs;
+const Init *DagInit::resolveReferences(Resolver &R) const {
+  SmallVector<const Init *, 8> NewArgs;
   NewArgs.reserve(arg_size());
   bool ArgsChanged = false;
   for (const Init *Arg : getArgs()) {
-    Init *NewArg = Arg->resolveReferences(R);
+    const Init *NewArg = Arg->resolveReferences(R);
     NewArgs.push_back(NewArg);
     ArgsChanged |= NewArg != Arg;
   }

-  Init *Op = Val->resolveReferences(R);
+  const Init *Op = Val->resolveReferences(R);
   if (Op != Val || ArgsChanged)
     return DagInit::get(Op, ValName, NewArgs, getArgNames());

-  return const_cast<DagInit *>(this);
+  return this;
 }

 bool DagInit::isConcrete() const {
@@ -2718,7 +2736,7 @@ std::string DagInit::getAsString() const {
 // Other implementations
 //===----------------------------------------------------------------------===//

-RecordVal::RecordVal(Init *N, const RecTy *T, FieldKind K)
+RecordVal::RecordVal(const Init *N, const RecTy *T, FieldKind K)
     : Name(N), TyAndKind(T, K) {
   setValue(UnsetInit::get(N->getRecordKeeper()));
   assert(Value && "Cannot create unset value for current type!");
@@ -2726,7 +2744,7 @@ RecordVal::RecordVal(Init *N, const RecTy *T, FieldKind K)

 // This constructor accepts the same arguments as the above, but also
 // a source location.
-RecordVal::RecordVal(Init *N, SMLoc Loc, const RecTy *T, FieldKind K)
+RecordVal::RecordVal(const Init *N, SMLoc Loc, const RecTy *T, FieldKind K)
     : Name(N), Loc(Loc), TyAndKind(T, K) {
   setValue(UnsetInit::get(N->getRecordKeeper()));
   assert(Value && "Cannot create unset value for current type!");
@@ -2751,7 +2769,7 @@ std::string RecordVal::getPrintType() const {
   }
 }

-bool RecordVal::setValue(Init *V) {
+bool RecordVal::setValue(const Init *V) {
   if (V) {
     Value = V->getCastTo(getType());
     if (Value) {
@@ -2759,7 +2777,7 @@ bool RecordVal::setValue(Init *V) {
              cast<TypedInit>(Value)->getType()->typeIsA(getType()));
       if (const BitsRecTy *BTy = dyn_cast<BitsRecTy>(getType())) {
         if (!isa<BitsInit>(Value)) {
-          SmallVector<Init *, 16> Bits;
+          SmallVector<const Init *, 16> Bits;
           Bits.reserve(BTy->getNumBits());
           for (unsigned I = 0, E = BTy->getNumBits(); I < E; ++I)
             Bits.push_back(Value->getBit(I));
@@ -2775,7 +2793,7 @@ bool RecordVal::setValue(Init *V) {

 // This version of setValue takes a source location and resets the
 // location in the RecordVal.
-bool RecordVal::setValue(Init *V, SMLoc NewLoc) {
+bool RecordVal::setValue(const Init *V, SMLoc NewLoc) {
   Loc = NewLoc;
   if (V) {
     Value = V->getCastTo(getType());
@@ -2784,7 +2802,7 @@ bool RecordVal::setValue(Init *V, SMLoc NewLoc) {
              cast<TypedInit>(Value)->getType()->typeIsA(getType()));
       if (const BitsRecTy *BTy = dyn_cast<BitsRecTy>(getType())) {
         if (!isa<BitsInit>(Value)) {
-          SmallVector<Init *, 16> Bits;
+          SmallVector<const Init *, 16> Bits;
           Bits.reserve(BTy->getNumBits());
           for (unsigned I = 0, E = BTy->getNumBits(); I < E; ++I)
             Bits.push_back(Value->getBit(I));
@@ -2847,7 +2865,7 @@ unsigned Record::getNewUID(RecordKeeper &RK) {
   return RK.getImpl().LastRecordID++;
 }

-void Record::setName(Init *NewName) {
+void Record::setName(const Init *NewName) {
   Name = NewName;
   checkName();
   // DO NOT resolve record values to the name at this point because
@@ -2893,8 +2911,8 @@ void Record::getDirectSuperClasses(
 }

 void Record::resolveReferences(Resolver &R, const RecordVal *SkipVal) {
-  Init *OldName = getNameInit();
-  Init *NewName = Name->resolveReferences(R);
+  const Init *OldName = getNameInit();
+  const Init *NewName = Name->resolveReferences(R);
   if (NewName != OldName) {
     // Re-register with RecordKeeper.
     setName(NewName);
@@ -2904,11 +2922,11 @@ void Record::resolveReferences(Resolver &R, const RecordVal *SkipVal) {
   for (RecordVal &Value : Values) {
     if (SkipVal == &Value) // Skip resolve the same field as the given one
       continue;
-    if (Init *V = Value.getValue()) {
-      Init *VR = V->resolveReferences(R);
+    if (const Init *V = Value.getValue()) {
+      const Init *VR = V->resolveReferences(R);
       if (Value.setValue(VR)) {
         std::string Type;
-        if (TypedInit *VRT = dyn_cast<TypedInit>(VR))
+        if (const TypedInit *VRT = dyn_cast<TypedInit>(VR))
           Type =
               (Twine("of type '") + VRT->getType()->getAsString() + "' ").str();
         PrintFatalError(
@@ -2924,19 +2942,19 @@ void Record::resolveReferences(Resolver &R, const RecordVal *SkipVal) {

   // Resolve the assertion expressions.
   for (auto &Assertion : Assertions) {
-    Init *Value = Assertion.Condition->resolveReferences(R);
+    const Init *Value = Assertion.Condition->resolveReferences(R);
     Assertion.Condition = Value;
     Value = Assertion.Message->resolveReferences(R);
     Assertion.Message = Value;
   }

   // Resolve the dump expressions.
   for (auto &Dump : Dumps) {
-    Init *Value = Dump.Message->resolveReferences(R);
+    const Init *Value = Dump.Message->resolveReferences(R);
     Dump.Message = Value;
   }
 }

-void Record::resolveReferences(Init *NewName) {
+void Record::resolveReferences(const Init *NewName) {
   RecordResolver R(*this);
   R.setName(NewName);
   R.setFinal(true);
@@ -2950,7 +2968,7 @@ LLVM_DUMP_METHOD void Record::dump() const { errs() << *this; }

 raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
   OS << R.getNameInitAsString();

-  ArrayRef<Init *> TArgs = R.getTemplateArgs();
+  ArrayRef<const Init *> TArgs = R.getTemplateArgs();
   if (!TArgs.empty()) {
     OS << "<";
     bool NeedComma = false;
@@ -2991,7 +3009,7 @@ SMLoc Record::getFieldLoc(StringRef FieldName) const {
   return R->getLoc();
 }

-Init *Record::getValueInit(StringRef FieldName) const {
+const Init *Record::getValueInit(StringRef FieldName) const {
   const RecordVal *R = getValue(FieldName);
   if (!R || !R->getValue())
     PrintFatalError(getLoc(), "Record `" + getName() +
@@ -3015,7 +3033,7 @@ Record::getValueAsOptionalString(StringRef FieldName) const {
   if (isa<UnsetInit>(R->getValue()))
     return std::nullopt;

-  if (StringInit *SI = dyn_cast<StringInit>(R->getValue()))
+  if (const StringInit *SI = dyn_cast<StringInit>(R->getValue()))
     return SI->getValue();

   PrintFatalError(getLoc(),
@@ -3023,25 +3041,25 @@ Record::getValueAsOptionalString(StringRef FieldName) const {
                   "' exists but does not have a string initializer!");
 }

-BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
+const BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
   const RecordVal *R = getValue(FieldName);
   if (!R || !R->getValue())
     PrintFatalError(getLoc(), "Record `" + getName() +
                                   "' does not have a field named `" +
                                   FieldName + "'!\n");

-  if (BitsInit *BI = dyn_cast<BitsInit>(R->getValue()))
+  if (const BitsInit *BI = dyn_cast<BitsInit>(R->getValue()))
     return BI;
   PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
                                 "' exists but does not have a bits value");
 }

-ListInit *Record::getValueAsListInit(StringRef FieldName) const {
+const ListInit *Record::getValueAsListInit(StringRef FieldName) const {
   const RecordVal *R = getValue(FieldName);
   if (!R || !R->getValue())
     PrintFatalError(getLoc(), "Record `" + getName() +
                                   "' does not have a field named `" +
                                   FieldName + "'!\n");

-  if (ListInit *LI = dyn_cast<ListInit>(R->getValue()))
+  if (const ListInit *LI = dyn_cast<ListInit>(R->getValue()))
     return LI;
   PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
                                 "' exists but does not have a list value");
@@ -3049,7 +3067,7 @@ ListInit *Record::getValueAsListInit(StringRef FieldName) const {

 std::vector<const Record *>
 Record::getValueAsListOfDefs(StringRef FieldName) const {
-  ListInit *List = getValueAsListInit(FieldName);
+  const ListInit *List = getValueAsListInit(FieldName);
   std::vector<const Record *> Defs;
   for (const Init *I : List->getValues()) {
     if (const DefInit *DI = dyn_cast<DefInit>(I))
@@ -3068,7 +3086,7 @@ int64_t Record::getValueAsInt(StringRef FieldName) const {
     PrintFatalError(getLoc(), "Record `" + getName() +
                                   "' does not have a field named `" +
                                   FieldName + "'!\n");

-  if (IntInit *II = dyn_cast<IntInit>(R->getValue()))
+  if (const IntInit *II = dyn_cast<IntInit>(R->getValue()))
     return II->getValue();
   PrintFatalError(getLoc(), Twine("Record `") + getName() + "', field `" +
                                 FieldName +
@@ -3078,10 +3096,10 @@ int64_t Record::getValueAsInt(StringRef FieldName) const {

 std::vector<int64_t>
 Record::getValueAsListOfInts(StringRef FieldName) const {
-  ListInit *List = getValueAsListInit(FieldName);
+  const ListInit *List = getValueAsListInit(FieldName);
   std::vector<int64_t> Ints;
-  for (Init *I : List->getValues()) {
-    if (IntInit *II = dyn_cast<IntInit>(I))
+  for (const Init *I : List->getValues()) {
+    if (const IntInit *II = dyn_cast<IntInit>(I))
       Ints.push_back(II->getValue());
     else
       PrintFatalError(getLoc(),
@@ -3094,10 +3112,10 @@ Record::getValueAsListOfInts(StringRef FieldName) const {

 std::vector<StringRef>
 Record::getValueAsListOfStrings(StringRef FieldName) const {
-  ListInit *List = getValueAsListInit(FieldName);
+  const ListInit *List = getValueAsListInit(FieldName);
   std::vector<StringRef> Strings;
-  for (Init *I : List->getValues()) {
-    if (StringInit *SI = dyn_cast<StringInit>(I))
+  for (const Init *I : List->getValues()) {
+    if (const StringInit *SI = dyn_cast<StringInit>(I))
       Strings.push_back(SI->getValue());
     else
       PrintFatalError(getLoc(),
@@ -3114,7 +3132,7 @@ const Record *Record::getValueAsDef(StringRef FieldName) const {
     PrintFatalError(getLoc(), "Record `" + getName() +
                                   "' does not have a field named `" +
                                   FieldName + "'!\n");

-  if (DefInit *DI = dyn_cast<DefInit>(R->getValue()))
+  if (const DefInit *DI = dyn_cast<DefInit>(R->getValue()))
     return DI->getDef();
   PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
                                 "' does not have a def initializer!");
@@ -3126,7 +3144,7 @@ const Record *Record::getValueAsOptionalDef(StringRef FieldName) const {
     PrintFatalError(getLoc(), "Record `" + getName() +
                                   "' does not have a field named `" +
                                   FieldName + "'!\n");

-  if (DefInit *DI = dyn_cast<DefInit>(R->getValue()))
+  if (const DefInit *DI = dyn_cast<DefInit>(R->getValue()))
     return DI->getDef();
   if (isa<UnsetInit>(R->getValue()))
     return nullptr;
@@ -3140,7 +3158,7 @@ bool Record::getValueAsBit(StringRef FieldName) const {
     PrintFatalError(getLoc(), "Record `" + getName() +
                                   "' does not have a field named `" +
                                   FieldName + "'!\n");

-  if (BitInit *BI = dyn_cast<BitInit>(R->getValue()))
+  if (const BitInit *BI = dyn_cast<BitInit>(R->getValue()))
     return BI->getValue();
   PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
                                 "' does not have a bit initializer!");
@@ -3157,19 +3175,19 @@ bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const {
     return false;
   }
   Unset = false;
-  if (BitInit *BI = dyn_cast<BitInit>(R->getValue()))
+  if (const BitInit *BI = dyn_cast<BitInit>(R->getValue()))
     return BI->getValue();
   PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
                                 "' does not have a bit initializer!");
 }

-DagInit *Record::getValueAsDag(StringRef FieldName) const {
+const DagInit *Record::getValueAsDag(StringRef FieldName) const {
   const RecordVal *R = getValue(FieldName);
   if (!R || !R->getValue())
     PrintFatalError(getLoc(), "Record `" + getName() +
                                   "' does not have a field named `" +
                                   FieldName + "'!\n");

-  if (DagInit *DI = dyn_cast<DagInit>(R->getValue()))
+  if (const DagInit *DI = dyn_cast<DagInit>(R->getValue()))
     return DI;
   PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
                                 "' does not have a dag initializer!");
@@ -3185,8 +3203,8 @@ void Record::checkRecordAssertions() {

   bool AnyFailed = false;
   for (const auto &Assertion : getAssertions()) {
-    Init *Condition = Assertion.Condition->resolveReferences(R);
-    Init *Message = Assertion.Message->resolveReferences(R);
+    const Init *Condition = Assertion.Condition->resolveReferences(R);
+    const Init *Message = Assertion.Message->resolveReferences(R);
     AnyFailed |= CheckAssert(Assertion.Loc, Condition, Message);
   }

@@ -3203,7 +3221,7 @@ void Record::emitRecordDumps() {
   R.setFinal(true);

   for (const auto &Dump : getDumps()) {
-    Init *Message = Dump.Message->resolveReferences(R);
+    const Init *Message = Dump.Message->resolveReferences(R);
     dumpMessage(Dump.Loc, Message);
   }
 }
@@ -3241,7 +3259,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {

 /// GetNewAnonymousName - Generate a unique anonymous name that can be used as
 /// an identifier.
-Init *RecordKeeper::getNewAnonymousName() {
+const Init *RecordKeeper::getNewAnonymousName() {
   return AnonymousNameInit::get(*this, getImpl().AnonCounter++);
 }

@@ -3289,12 +3307,12 @@ void RecordKeeper::dumpAllocationStats(raw_ostream &OS) const {
   Impl->dumpAllocationStats(OS);
 }

-Init *MapResolver::resolve(Init *VarName) {
+const Init *MapResolver::resolve(const Init *VarName) {
   auto It = Map.find(VarName);
   if (It == Map.end())
     return nullptr;

-  Init *I = It->second.V;
+  const Init *I = It->second.V;

   if (!It->second.Resolved && Map.size() > 1) {
     // Resolve mutual references among the mapped variables, but prevent
@@ -3307,15 +3325,15 @@ Init *MapResolver::resolve(Init *VarName) {
   return I;
 }

-Init *RecordResolver::resolve(Init *VarName) {
-  Init *Val = Cache.lookup(VarName);
+const Init *RecordResolver::resolve(const Init *VarName) {
+  const Init *Val = Cache.lookup(VarName);
   if (Val)
     return Val;

   if (llvm::is_contained(Stack, VarName))
     return nullptr; // prevent infinite recursion

-  if (RecordVal *RV = getCurrentRecord()->getValue(VarName)) {
+  if (const RecordVal *RV = getCurrentRecord()->getValue(VarName)) {
     if (!isa<UnsetInit>(RV->getValue())) {
       Val = RV->getValue();
       Stack.push_back(VarName);
@@ -3332,8 +3350,8 @@ Init *RecordResolver::resolve(Init *VarName) {
   return Val;
 }

-Init *TrackUnresolvedResolver::resolve(Init *VarName) {
-  Init *I = nullptr;
+const Init *TrackUnresolvedResolver::resolve(const Init *VarName) {
+  const Init *I = nullptr;

   if (R) {
     I = R->resolve(VarName);
@@ -3352,8 +3370,7 @@ Init *TrackUnresolvedResolver::resolve(Init *VarName) {
   return I;
 }

-Init *HasReferenceResolver::resolve(Init *VarName)
-{
+const Init *HasReferenceResolver::resolve(const Init *VarName) {
   if (VarName == VarNameToTrack)
     Found = true;
   return nullptr;
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index aed4f3fe0e96..97a7e680e0c3 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -35,7 +35,7 @@ namespace llvm {
 struct SubClassReference {
   SMRange RefRange;
   Record *Rec = nullptr;
-  SmallVector<Init *, 4> TemplateArgs;
+  SmallVector<const Init *, 4> TemplateArgs;

   SubClassReference() = default;

@@ -45,7 +45,7 @@ struct SubClassReference {
 struct SubMultiClassReference {
   SMRange RefRange;
   MultiClass *MC = nullptr;
-  SmallVector<Init *, 4> TemplateArgs;
+  SmallVector<const Init *, 4> TemplateArgs;

   SubMultiClassReference() = default;

@@ -60,7 +60,7 @@ LLVM_DUMP_METHOD void SubMultiClassReference::dump() const {
   MC->dump();

   errs() << "Template args:\n";
-  for (Init *TA : TemplateArgs)
+  for (const Init *TA : TemplateArgs)
     TA->dump();
 }
 #endif
@@ -68,9 +68,9 @@ LLVM_DUMP_METHOD void SubMultiClassReference::dump() const {
 } // end namespace llvm

 static bool checkBitsConcrete(Record &R, const RecordVal &RV) {
-  BitsInit *BV = cast<BitsInit>(RV.getValue());
+  const BitsInit *BV = cast<BitsInit>(RV.getValue());
   for (unsigned i = 0, e = BV->getNumBits(); i != e; ++i) {
-    Init *Bit = BV->getBit(i);
+    const Init *Bit = BV->getBit(i);
     bool IsReference = false;
     if (auto VBI = dyn_cast<VarBitInit>(Bit)) {
       if (auto VI = dyn_cast<VarInit>(VBI->getBitVar())) {
@@ -95,7 +95,7 @@ static void checkConcrete(Record &R) {
     if (RV.isNonconcreteOK())
       continue;

-    if (Init *V = RV.getValue()) {
+    if (const Init *V = RV.getValue()) {
       bool Ok = isa<BitsInit>(V) ? checkBitsConcrete(R, RV) : V->isConcrete();
       if (!Ok) {
         PrintError(R.getLoc(),
@@ -110,43 +110,45 @@ static void checkConcrete(Record &R) {

 /// Return an Init with a qualifier prefix referring
 /// to CurRec's name.
-static Init *QualifyName(Record &CurRec, Init *Name) {
+static const Init *QualifyName(Record &CurRec, const Init *Name) {
   RecordKeeper &RK = CurRec.getRecords();
-  Init *NewName = BinOpInit::getStrConcat(
+  const Init *NewName = BinOpInit::getStrConcat(
       CurRec.getNameInit(),
       StringInit::get(RK, CurRec.isMultiClass() ? "::" : ":"));
   NewName = BinOpInit::getStrConcat(NewName, Name);

-  if (BinOpInit *BinOp = dyn_cast<BinOpInit>(NewName))
+  if (const BinOpInit *BinOp = dyn_cast<BinOpInit>(NewName))
     NewName = BinOp->Fold(&CurRec);
   return NewName;
 }

-static Init *QualifyName(MultiClass *MC, Init *Name) {
+static const Init *QualifyName(MultiClass *MC, const Init *Name) {
   return QualifyName(MC->Rec, Name);
 }

 /// Return the qualified version of the implicit 'NAME' template argument.
-static Init *QualifiedNameOfImplicitName(Record &Rec) {
+static const Init *QualifiedNameOfImplicitName(Record &Rec) {
   return QualifyName(Rec, StringInit::get(Rec.getRecords(), "NAME"));
 }

-static Init *QualifiedNameOfImplicitName(MultiClass *MC) {
+static const Init *QualifiedNameOfImplicitName(MultiClass *MC) {
   return QualifiedNameOfImplicitName(MC->Rec);
 }

-Init *TGVarScope::getVar(RecordKeeper &Records, MultiClass *ParsingMultiClass,
-                         StringInit *Name, SMRange NameLoc,
-                         bool TrackReferenceLocs) const {
+const Init *TGVarScope::getVar(RecordKeeper &Records,
+                               MultiClass *ParsingMultiClass,
+                               const StringInit *Name, SMRange NameLoc,
+                               bool TrackReferenceLocs) const {
   // First, we search in local variables.
   auto It = Vars.find(Name->getValue());
   if (It != Vars.end())
     return It->second;

-  auto FindValueInArgs = [&](Record *Rec, StringInit *Name) -> Init * {
+  auto FindValueInArgs = [&](Record *Rec,
+                             const StringInit *Name) -> const Init * {
     if (!Rec)
       return nullptr;
-    Init *ArgName = QualifyName(*Rec, Name);
+    const Init *ArgName = QualifyName(*Rec, Name);
     if (Rec->isTemplateArg(ArgName)) {
       RecordVal *RV = Rec->getValue(ArgName);
       assert(RV && "Template arg doesn't exist??");
@@ -184,7 +186,7 @@ Init *TGVarScope::getVar(RecordKeeper &Records, MultiClass *ParsingMultiClass,
     case SK_ForeachLoop: {
       // The variable is a loop iterator?
       if (CurLoop->IterVar) {
-        VarInit *IterVar = dyn_cast<VarInit>(CurLoop->IterVar);
+        const VarInit *IterVar = dyn_cast<VarInit>(CurLoop->IterVar);
         if (IterVar && IterVar->getNameInit() == Name)
           return IterVar;
       }
@@ -226,8 +228,8 @@ bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) {

 /// SetValue -
 /// Return true on error, false on success.
-bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
-                        ArrayRef<unsigned> BitList, Init *V,
+bool TGParser::SetValue(Record *CurRec, SMLoc Loc, const Init *ValName,
+                        ArrayRef<unsigned> BitList, const Init *V,
                         bool AllowSelfAssignment, bool OverrideDefLoc) {
   if (!V)
     return false;
@@ -241,7 +243,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
   // Do not allow assignments like 'X = X'. This will just cause infinite loops
   // in the resolution machinery.
   if (BitList.empty())
-    if (VarInit *VI = dyn_cast<VarInit>(V))
+    if (const VarInit *VI = dyn_cast<VarInit>(V))
       if (VI->getNameInit() == ValName && !AllowSelfAssignment)
         return Error(Loc, "Recursion / self-assignment forbidden");

@@ -250,17 +252,17 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
   // initializer.
// if (!BitList.empty()) { - BitsInit *CurVal = dyn_cast(RV->getValue()); + const BitsInit *CurVal = dyn_cast(RV->getValue()); if (!CurVal) return Error(Loc, "Value '" + ValName->getAsUnquotedString() + "' is not a bits type"); // Convert the incoming value to a bits type of the appropriate size... - Init *BI = V->getCastTo(BitsRecTy::get(Records, BitList.size())); + const Init *BI = V->getCastTo(BitsRecTy::get(Records, BitList.size())); if (!BI) return Error(Loc, "Initializer is not compatible with bit range"); - SmallVector NewBits(CurVal->getNumBits()); + SmallVector NewBits(CurVal->getNumBits()); // Loop over bits, assigning values as appropriate. for (unsigned i = 0, e = BitList.size(); i != e; ++i) { @@ -280,10 +282,10 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, if (OverrideDefLoc ? RV->setValue(V, Loc) : RV->setValue(V)) { std::string InitType; - if (BitsInit *BI = dyn_cast(V)) + if (const BitsInit *BI = dyn_cast(V)) InitType = (Twine("' of type bit initializer with length ") + Twine(BI->getNumBits())).str(); - else if (TypedInit *TI = dyn_cast(V)) + else if (const TypedInit *TI = dyn_cast(V)) InitType = (Twine("' of type '") + TI->getType()->getAsString()).str(); return Error(Loc, "Field '" + ValName->getAsUnquotedString() + "' of type '" + RV->getType()->getAsString() + @@ -316,7 +318,7 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { // Copy the subclass record's dumps to the new record. CurRec->appendDumps(SC); - Init *Name; + const Init *Name; if (CurRec->isClass()) Name = VarInit::get(QualifiedNameOfImplicitName(*CurRec), StringRecTy::get(Records)); @@ -427,7 +429,7 @@ bool TGParser::resolve(const ForeachLoop &Loop, SubstStack &Substs, MapResolver R; for (const auto &S : Substs) R.set(S.first, S.second); - Init *List = Loop.ListValue->resolveReferences(R); + const Init *List = Loop.ListValue->resolveReferences(R); // For if-then-else blocks, we lower to a foreach loop whose list is a // ternary selection between lists of different length. Since we don't @@ -437,17 +439,17 @@ bool TGParser::resolve(const ForeachLoop &Loop, SubstStack &Substs, // e.g. 
!if(!exists("__does_not_exist__"), [1], []) if (auto *TI = dyn_cast(List); TI && TI->getOpcode() == TernOpInit::IF && Final) { - Init *OldLHS = TI->getLHS(); + const Init *OldLHS = TI->getLHS(); R.setFinal(true); - Init *LHS = OldLHS->resolveReferences(R); + const Init *LHS = OldLHS->resolveReferences(R); if (LHS == OldLHS) { PrintError(Loop.Loc, Twine("unable to resolve if condition '") + LHS->getAsString() + "' at end of containing scope"); return true; } - Init *MHS = TI->getMHS(); - Init *RHS = TI->getRHS(); + const Init *MHS = TI->getMHS(); + const Init *RHS = TI->getRHS(); List = TernOpInit::get(TernOpInit::IF, LHS, MHS, RHS, TI->getType()) ->Fold(nullptr); } @@ -496,8 +498,8 @@ bool TGParser::resolve(const std::vector &Source, MapResolver R; for (const auto &S : Substs) R.set(S.first, S.second); - Init *Condition = E.Assertion->Condition->resolveReferences(R); - Init *Message = E.Assertion->Message->resolveReferences(R); + const Init *Condition = E.Assertion->Condition->resolveReferences(R); + const Init *Message = E.Assertion->Message->resolveReferences(R); if (Dest) Dest->push_back(std::make_unique( @@ -509,7 +511,7 @@ bool TGParser::resolve(const std::vector &Source, MapResolver R; for (const auto &S : Substs) R.set(S.first, S.second); - Init *Message = E.Dump->Message->resolveReferences(R); + const Init *Message = E.Dump->Message->resolveReferences(R); if (Dest) Dest->push_back( @@ -540,7 +542,7 @@ bool TGParser::resolve(const std::vector &Source, /// Resolve the record fully and add it to the record keeper. bool TGParser::addDefOne(std::unique_ptr Rec) { - Init *NewName = nullptr; + const Init *NewName = nullptr; if (const Record *Prev = Records.getDef(Rec->getNameInitAsString())) { if (!Rec->isAnonymous()) { PrintError(Rec->getLoc(), @@ -586,17 +588,18 @@ bool TGParser::addDefOne(std::unique_ptr Rec) { return false; } -bool TGParser::resolveArguments(Record *Rec, ArrayRef ArgValues, +bool TGParser::resolveArguments(Record *Rec, + ArrayRef ArgValues, SMLoc Loc, ArgValueHandler ArgValueHandler) { - ArrayRef ArgNames = Rec->getTemplateArgs(); + ArrayRef ArgNames = Rec->getTemplateArgs(); assert(ArgValues.size() <= ArgNames.size() && "Too many template arguments allowed"); // Loop over the template arguments and handle the (name, value) pair. - SmallVector UnsolvedArgNames(ArgNames); + SmallVector UnsolvedArgNames(ArgNames); for (auto *Arg : ArgValues) { - Init *ArgName = nullptr; - Init *ArgValue = Arg->getValue(); + const Init *ArgName = nullptr; + const Init *ArgValue = Arg->getValue(); if (Arg->isPositional()) ArgName = ArgNames[Arg->getIndex()]; if (Arg->isNamed()) @@ -613,7 +616,7 @@ bool TGParser::resolveArguments(Record *Rec, ArrayRef ArgValues, // For unsolved arguments, if there is no default value, complain. for (auto *UnsolvedArgName : UnsolvedArgNames) { - Init *Default = Rec->getValue(UnsolvedArgName)->getValue(); + const Init *Default = Rec->getValue(UnsolvedArgName)->getValue(); if (!Default->isComplete()) { std::string Name = UnsolvedArgName->getAsUnquotedString(); Error(Loc, "value not specified for template argument '" + Name + "'"); @@ -630,22 +633,24 @@ bool TGParser::resolveArguments(Record *Rec, ArrayRef ArgValues, /// Resolve the arguments of class and set them to MapResolver. /// Returns true if failed. 
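One detail in the loop-resolution hunk above is worth a note: resolveReferences hands back its argument pointer unchanged when no substitution applies, so "did the if condition resolve?" is a pointer-identity test. A toy illustration of the idiom (stand-in types, not the real Init API):

struct ToyCondInit {};

// Stand-in for Init::resolveReferences: by convention it returns the same
// pointer when nothing could be substituted.
const ToyCondInit *resolveRefs(const ToyCondInit *I) { return I; }

// True when resolution made progress; on the identity case the parser
// reports "unable to resolve if condition ... at end of containing scope".
bool conditionResolved(const ToyCondInit *OldLHS) {
  const ToyCondInit *LHS = resolveRefs(OldLHS);
  return LHS != OldLHS;
}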
bool TGParser::resolveArgumentsOfClass(MapResolver &R, Record *Rec, - ArrayRef ArgValues, + ArrayRef ArgValues, SMLoc Loc) { - return resolveArguments(Rec, ArgValues, Loc, - [&](Init *Name, Init *Value) { R.set(Name, Value); }); + return resolveArguments( + Rec, ArgValues, Loc, + [&](const Init *Name, const Init *Value) { R.set(Name, Value); }); } /// Resolve the arguments of multiclass and store them into SubstStack. /// Returns true if failed. -bool TGParser::resolveArgumentsOfMultiClass(SubstStack &Substs, MultiClass *MC, - ArrayRef ArgValues, - Init *DefmName, SMLoc Loc) { +bool TGParser::resolveArgumentsOfMultiClass( + SubstStack &Substs, MultiClass *MC, + ArrayRef ArgValues, const Init *DefmName, SMLoc Loc) { // Add an implicit argument NAME. Substs.emplace_back(QualifiedNameOfImplicitName(MC), DefmName); - return resolveArguments( - &MC->Rec, ArgValues, Loc, - [&](Init *Name, Init *Value) { Substs.emplace_back(Name, Value); }); + return resolveArguments(&MC->Rec, ArgValues, Loc, + [&](const Init *Name, const Init *Value) { + Substs.emplace_back(Name, Value); + }); } //===----------------------------------------------------------------------===// @@ -666,7 +671,7 @@ bool TGParser::consume(tgtok::TokKind K) { /// ObjectName ::= Value [ '#' Value ]* /// ObjectName ::= /*empty*/ /// -Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { +const Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { switch (Lex.getCode()) { case tgtok::colon: case tgtok::semi: @@ -683,12 +688,13 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { if (CurMultiClass) CurRec = &CurMultiClass->Rec; - Init *Name = ParseValue(CurRec, StringRecTy::get(Records), ParseNameMode); + const Init *Name = + ParseValue(CurRec, StringRecTy::get(Records), ParseNameMode); if (!Name) return nullptr; if (CurMultiClass) { - Init *NameStr = QualifiedNameOfImplicitName(CurMultiClass); + const Init *NameStr = QualifiedNameOfImplicitName(CurMultiClass); HasReferenceResolver R(NameStr); Name->resolveReferences(R); if (!R.found()) @@ -827,14 +833,14 @@ ParseSubMultiClassReference(MultiClass *CurMC) { /// /// SliceElement is either IntRecTy, ListRecTy, or nullptr /// -TypedInit *TGParser::ParseSliceElement(Record *CurRec) { +const TypedInit *TGParser::ParseSliceElement(Record *CurRec) { auto LHSLoc = Lex.getLoc(); auto *CurVal = ParseValue(CurRec); if (!CurVal) return nullptr; auto *LHS = cast(CurVal); - TypedInit *RHS = nullptr; + const TypedInit *RHS = nullptr; switch (Lex.getCode()) { case tgtok::dotdotdot: case tgtok::minus: { // Deprecated @@ -891,10 +897,10 @@ TypedInit *TGParser::ParseSliceElement(Record *CurRec) { /// - Single=true /// - SliceElements is Value w/o trailing comma /// -TypedInit *TGParser::ParseSliceElements(Record *CurRec, bool Single) { - TypedInit *CurVal; - SmallVector Elems; // int - SmallVector Slices; // list +const TypedInit *TGParser::ParseSliceElements(Record *CurRec, bool Single) { + const TypedInit *CurVal; + SmallVector Elems; // int + SmallVector Slices; // list auto FlushElems = [&] { if (!Elems.empty()) { @@ -950,7 +956,7 @@ TypedInit *TGParser::ParseSliceElements(Record *CurRec, bool Single) { FlushElems(); // Concatenate lists in Slices - TypedInit *Result = nullptr; + const TypedInit *Result = nullptr; for (auto *Slice : Slices) { Result = (Result ? cast(BinOpInit::getListConcat(Result, Slice)) : Slice); @@ -966,12 +972,12 @@ TypedInit *TGParser::ParseSliceElements(Record *CurRec, bool Single) { /// RangePiece ::= INTVAL INTVAL // The last two forms are deprecated. 
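resolveArgumentsOfClass and resolveArgumentsOfMultiClass above differ only in what they do with each resolved (name, value) pair, so both delegate to resolveArguments with a small callback, now typed over const pointers. The same shape as a self-contained sketch with toy types:

#include <functional>
#include <utility>
#include <vector>

struct ToyArgName {};
struct ToyArgValue {};
using ToyArgHandler =
    std::function<void(const ToyArgName *, const ToyArgValue *)>;

void forEachResolvedArg(
    const std::vector<std::pair<const ToyArgName *, const ToyArgValue *>>
        &Args,
    const ToyArgHandler &Handle) {
  for (const auto &[Name, Value] : Args)
    Handle(Name, Value);
}

// The class-style caller feeds a resolver map; the multiclass-style caller
// pushes onto a substitution stack. Each is a one-line lambda at the call
// site, exactly as in the hunks above.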
bool TGParser::ParseRangePiece(SmallVectorImpl &Ranges, - TypedInit *FirstItem) { - Init *CurVal = FirstItem; + const TypedInit *FirstItem) { + const Init *CurVal = FirstItem; if (!CurVal) CurVal = ParseValue(nullptr); - IntInit *II = dyn_cast_or_null(CurVal); + const IntInit *II = dyn_cast_or_null(CurVal); if (!II) return TokError("expected integer or bitrange"); @@ -990,8 +996,8 @@ bool TGParser::ParseRangePiece(SmallVectorImpl &Ranges, case tgtok::minus: { Lex.Lex(); // eat - Init *I_End = ParseValue(nullptr); - IntInit *II_End = dyn_cast_or_null(I_End); + const Init *I_End = ParseValue(nullptr); + const IntInit *II_End = dyn_cast_or_null(I_End); if (!II_End) { TokError("expected integer value as end of range"); return true; @@ -1149,16 +1155,16 @@ const RecTy *TGParser::ParseType() { } /// ParseIDValue -Init *TGParser::ParseIDValue(Record *CurRec, StringInit *Name, SMRange NameLoc, - IDParseMode Mode) { - if (Init *I = CurScope->getVar(Records, CurMultiClass, Name, NameLoc, - TrackReferenceLocs)) +const Init *TGParser::ParseIDValue(Record *CurRec, const StringInit *Name, + SMRange NameLoc, IDParseMode Mode) { + if (const Init *I = CurScope->getVar(Records, CurMultiClass, Name, NameLoc, + TrackReferenceLocs)) return I; if (Mode == ParseNameMode) return Name; - if (Init *I = Records.getGlobal(Name->getValue())) { + if (const Init *I = Records.getGlobal(Name->getValue())) { // Add a reference to the global if it's a record. if (TrackReferenceLocs) { if (auto *Def = dyn_cast(I)) @@ -1181,7 +1187,7 @@ Init *TGParser::ParseIDValue(Record *CurRec, StringInit *Name, SMRange NameLoc, /// /// Operation ::= XOperator ['<' Type '>'] '(' Args ')' /// -Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { +const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { switch (Lex.getCode()) { default: TokError("unknown bang operator"); @@ -1291,14 +1297,14 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *LHS = ParseValue(CurRec); + const Init *LHS = ParseValue(CurRec); if (!LHS) return nullptr; if (Code == UnOpInit::EMPTY || Code == UnOpInit::SIZE) { - ListInit *LHSl = dyn_cast(LHS); - StringInit *LHSs = dyn_cast(LHS); - DagInit *LHSd = dyn_cast(LHS); - TypedInit *LHSt = dyn_cast(LHS); + const ListInit *LHSl = dyn_cast(LHS); + const StringInit *LHSs = dyn_cast(LHS); + const DagInit *LHSd = dyn_cast(LHS); + const TypedInit *LHSt = dyn_cast(LHS); if (!LHSl && !LHSs && !LHSd && !LHSt) { TokError("expected string, list, or dag type argument in unary operator"); return nullptr; @@ -1313,8 +1319,8 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL || Code == UnOpInit::LISTFLATTEN) { - ListInit *LHSl = dyn_cast(LHS); - TypedInit *LHSt = dyn_cast(LHS); + const ListInit *LHSl = dyn_cast(LHS); + const TypedInit *LHSt = dyn_cast(LHS); if (!LHSl && !LHSt) { TokError("expected list type argument in unary operator"); return nullptr; @@ -1333,8 +1339,8 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { bool UseElementType = Code == UnOpInit::HEAD || Code == UnOpInit::LISTFLATTEN; if (LHSl) { - Init *Item = LHSl->getElement(0); - TypedInit *Itemt = dyn_cast(Item); + const Init *Item = LHSl->getElement(0); + const TypedInit *Itemt = dyn_cast(Item); if (!Itemt) { TokError("untyped list element in unary operator"); return nullptr; @@ -1381,7 +1387,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return 
nullptr; } - Init *LHS = ParseValue(CurRec); + const Init *LHS = ParseValue(CurRec); if (!LHS) return nullptr; @@ -1390,7 +1396,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - return (IsAOpInit::get(Type, LHS))->Fold(); + return IsAOpInit::get(Type, LHS)->Fold(); } case tgtok::XExists: { @@ -1407,11 +1413,11 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } SMLoc ExprLoc = Lex.getLoc(); - Init *Expr = ParseValue(CurRec); + const Init *Expr = ParseValue(CurRec); if (!Expr) return nullptr; - TypedInit *ExprType = dyn_cast(Expr); + const TypedInit *ExprType = dyn_cast(Expr); if (!ExprType) { Error(ExprLoc, "expected string type argument in !exists operator"); return nullptr; @@ -1580,7 +1586,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - SmallVector InitList; + SmallVector InitList; // Note that this loop consumes an arbitrary number of arguments. // The actual count is checked later. @@ -1589,7 +1595,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { InitList.push_back(ParseValue(CurRec, ArgType)); if (!InitList.back()) return nullptr; - TypedInit *InitListBack = dyn_cast(InitList.back()); + const TypedInit *InitListBack = dyn_cast(InitList.back()); if (!InitListBack) { Error(OpLoc, Twine("expected value to be a typed value, got '" + InitList.back()->getAsString() + "'")); @@ -1759,7 +1765,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { Code == BinOpInit::AND || Code == BinOpInit::OR || Code == BinOpInit::XOR || Code == BinOpInit::MUL) { while (InitList.size() > 2) { - Init *RHS = InitList.pop_back_val(); + const Init *RHS = InitList.pop_back_val(); RHS = (BinOpInit::get(Code, InitList.back(), RHS, Type))->Fold(CurRec); InitList.back() = RHS; } @@ -1787,7 +1793,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - SmallVector Args; + SmallVector Args; bool FirstArgIsList = false; for (;;) { if (Args.size() >= 3) { @@ -1800,7 +1806,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (!Args.back()) return nullptr; - TypedInit *ArgBack = dyn_cast(Args.back()); + const TypedInit *ArgBack = dyn_cast(Args.back()); if (!ArgBack) { Error(OpLoc, Twine("expected value to be a typed value, got '" + Args.back()->getAsString() + "'")); @@ -1838,7 +1844,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *LHS, *MHS, *RHS; + const Init *LHS, *MHS, *RHS; auto ArgCount = Args.size(); assert(ArgCount >= 1); auto *Arg0 = cast(Args[0]); @@ -1916,7 +1922,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *LHS = ParseValue(CurRec); + const Init *LHS = ParseValue(CurRec); if (!LHS) return nullptr; if (!consume(tgtok::comma)) { @@ -1925,7 +1931,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } SMLoc MHSLoc = Lex.getLoc(); - Init *MHS = ParseValue(CurRec, ItemType); + const Init *MHS = ParseValue(CurRec, ItemType); if (!MHS) return nullptr; @@ -1935,7 +1941,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } SMLoc RHSLoc = Lex.getLoc(); - Init *RHS = ParseValue(CurRec, ItemType); + const Init *RHS = ParseValue(CurRec, ItemType); if (!RHS) return nullptr; @@ -1947,7 +1953,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { switch (LexCode) { default: llvm_unreachable("Unhandled code!"); case 
tgtok::XDag: { - TypedInit *MHSt = dyn_cast(MHS); + const TypedInit *MHSt = dyn_cast(MHS); if (!MHSt && !isa(MHS)) { Error(MHSLoc, "could not determine type of the child list in !dag"); return nullptr; @@ -1958,7 +1964,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - TypedInit *RHSt = dyn_cast(RHS); + const TypedInit *RHSt = dyn_cast(RHS); if (!RHSt && !isa(RHS)) { Error(RHSLoc, "could not determine type of the name list in !dag"); return nullptr; @@ -1980,16 +1986,16 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { const RecTy *MHSTy = nullptr; const RecTy *RHSTy = nullptr; - if (TypedInit *MHSt = dyn_cast(MHS)) + if (const TypedInit *MHSt = dyn_cast(MHS)) MHSTy = MHSt->getType(); - if (BitsInit *MHSbits = dyn_cast(MHS)) + if (const BitsInit *MHSbits = dyn_cast(MHS)) MHSTy = BitsRecTy::get(Records, MHSbits->getNumBits()); if (isa(MHS)) MHSTy = BitRecTy::get(Records); - if (TypedInit *RHSt = dyn_cast(RHS)) + if (const TypedInit *RHSt = dyn_cast(RHS)) RHSTy = RHSt->getType(); - if (BitsInit *RHSbits = dyn_cast(RHS)) + if (const BitsInit *RHSbits = dyn_cast(RHS)) RHSTy = BitsRecTy::get(Records, RHSbits->getNumBits()); if (isa(RHS)) RHSTy = BitRecTy::get(Records); @@ -2014,7 +2020,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { break; } case tgtok::XSubst: { - TypedInit *RHSt = dyn_cast(RHS); + const TypedInit *RHSt = dyn_cast(RHS); if (!RHSt) { TokError("could not get type for !subst"); return nullptr; @@ -2023,7 +2029,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { break; } case tgtok::XSetDagArg: { - TypedInit *MHSt = dyn_cast(MHS); + const TypedInit *MHSt = dyn_cast(MHS); if (!MHSt || !isa(MHSt->getType())) { Error(MHSLoc, Twine("expected integer index or string name, got ") + (MHSt ? ("type '" + MHSt->getType()->getAsString()) @@ -2034,7 +2040,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { break; } case tgtok::XSetDagName: { - TypedInit *MHSt = dyn_cast(MHS); + const TypedInit *MHSt = dyn_cast(MHS); if (!MHSt || !isa(MHSt->getType())) { Error(MHSLoc, Twine("expected integer index or string name, got ") + (MHSt ? ("type '" + MHSt->getType()->getAsString()) @@ -2042,7 +2048,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { "'"); return nullptr; } - TypedInit *RHSt = dyn_cast(RHS); + const TypedInit *RHSt = dyn_cast(RHS); // The name could be a string or unset. 
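Stepping back to the variadic operators a little above: !add, !mul, !and, !or and !xor collapse their argument list two at a time from the right until a single binary node remains. Equivalent stand-alone logic over plain ints (the real loop folds interned BinOpInit nodes, and the parser has already checked the arity):

#include <vector>

int foldVariadic(std::vector<int> Args, int (*Op)(int, int)) {
  // Assumes at least two operands.
  while (Args.size() > 2) {
    int RHS = Args.back();
    Args.pop_back();
    Args.back() = Op(Args.back(), RHS); // fold (..., a, b) -> (..., Op(a, b))
  }
  return Op(Args[0], Args[1]);
}
// e.g. foldVariadic({1, 2, 3, 4}, [](int a, int b) { return a + b; }) == 10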
if (RHSt && !isa(RHSt->getType())) { Error(RHSLoc, Twine("expected string or unset name, got type '") + @@ -2072,11 +2078,11 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *StartUntyped = ParseValue(CurRec); + const Init *StartUntyped = ParseValue(CurRec); if (!StartUntyped) return nullptr; - TypedInit *Start = dyn_cast(StartUntyped); + const TypedInit *Start = dyn_cast(StartUntyped); if (!Start) { TokError(Twine("could not get type of !foldl start: '") + StartUntyped->getAsString() + "'"); @@ -2088,11 +2094,11 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *ListUntyped = ParseValue(CurRec); + const Init *ListUntyped = ParseValue(CurRec); if (!ListUntyped) return nullptr; - TypedInit *List = dyn_cast(ListUntyped); + const TypedInit *List = dyn_cast(ListUntyped); if (!List) { TokError(Twine("could not get type of !foldl list: '") + ListUntyped->getAsString() + "'"); @@ -2116,7 +2122,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *A = StringInit::get(Records, Lex.getCurStrVal()); + const Init *A = StringInit::get(Records, Lex.getCurStrVal()); if (CurRec && CurRec->getValue(A)) { TokError((Twine("left !foldl variable '") + A->getAsString() + "' already defined") @@ -2134,7 +2140,7 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *B = StringInit::get(Records, Lex.getCurStrVal()); + const Init *B = StringInit::get(Records, Lex.getCurStrVal()); if (CurRec && CurRec->getValue(B)) { TokError((Twine("right !foldl variable '") + B->getAsString() + "' already defined") @@ -2161,14 +2167,14 @@ Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { ParseRec->addValue(RecordVal(A, Start->getType(), RecordVal::FK_Normal)); ParseRec->addValue( RecordVal(B, ListType->getElementType(), RecordVal::FK_Normal)); - Init *ExprUntyped = ParseValue(ParseRec); + const Init *ExprUntyped = ParseValue(ParseRec); ParseRec->removeValue(A); ParseRec->removeValue(B); PopScope(FoldScope); if (!ExprUntyped) return nullptr; - TypedInit *Expr = dyn_cast(ExprUntyped); + const TypedInit *Expr = dyn_cast(ExprUntyped); if (!Expr) { TokError("could not get type of !foldl expression"); return nullptr; @@ -2226,7 +2232,8 @@ const RecTy *TGParser::ParseOperatorType() { /// Parse the !substr operation. Return null on error. 
/// /// Substr ::= !substr(string, start-int [, length-int]) => string -Init *TGParser::ParseOperationSubstr(Record *CurRec, const RecTy *ItemType) { +const Init *TGParser::ParseOperationSubstr(Record *CurRec, + const RecTy *ItemType) { TernOpInit::TernaryOp Code = TernOpInit::SUBSTR; const RecTy *Type = StringRecTy::get(Records); @@ -2237,7 +2244,7 @@ Init *TGParser::ParseOperationSubstr(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *LHS = ParseValue(CurRec); + const Init *LHS = ParseValue(CurRec); if (!LHS) return nullptr; @@ -2247,12 +2254,12 @@ Init *TGParser::ParseOperationSubstr(Record *CurRec, const RecTy *ItemType) { } SMLoc MHSLoc = Lex.getLoc(); - Init *MHS = ParseValue(CurRec); + const Init *MHS = ParseValue(CurRec); if (!MHS) return nullptr; SMLoc RHSLoc = Lex.getLoc(); - Init *RHS; + const Init *RHS; if (consume(tgtok::comma)) { RHSLoc = Lex.getLoc(); RHS = ParseValue(CurRec); @@ -2273,7 +2280,7 @@ Init *TGParser::ParseOperationSubstr(Record *CurRec, const RecTy *ItemType) { Type->getAsString() + "'"); } - TypedInit *LHSt = dyn_cast(LHS); + const TypedInit *LHSt = dyn_cast(LHS); if (!LHSt && !isa(LHS)) { TokError("could not determine type of the string in !substr"); return nullptr; @@ -2284,7 +2291,7 @@ Init *TGParser::ParseOperationSubstr(Record *CurRec, const RecTy *ItemType) { return nullptr; } - TypedInit *MHSt = dyn_cast(MHS); + const TypedInit *MHSt = dyn_cast(MHS); if (!MHSt && !isa(MHS)) { TokError("could not determine type of the start position in !substr"); return nullptr; @@ -2296,7 +2303,7 @@ Init *TGParser::ParseOperationSubstr(Record *CurRec, const RecTy *ItemType) { } if (RHS) { - TypedInit *RHSt = dyn_cast(RHS); + const TypedInit *RHSt = dyn_cast(RHS); if (!RHSt && !isa(RHS)) { TokError("could not determine type of the length in !substr"); return nullptr; @@ -2314,7 +2321,8 @@ Init *TGParser::ParseOperationSubstr(Record *CurRec, const RecTy *ItemType) { /// Parse the !find operation. Return null on error. 
/// /// Substr ::= !find(string, string [, start-int]) => int -Init *TGParser::ParseOperationFind(Record *CurRec, const RecTy *ItemType) { +const Init *TGParser::ParseOperationFind(Record *CurRec, + const RecTy *ItemType) { TernOpInit::TernaryOp Code = TernOpInit::FIND; const RecTy *Type = IntRecTy::get(Records); @@ -2325,7 +2333,7 @@ Init *TGParser::ParseOperationFind(Record *CurRec, const RecTy *ItemType) { return nullptr; } - Init *LHS = ParseValue(CurRec); + const Init *LHS = ParseValue(CurRec); if (!LHS) return nullptr; @@ -2335,12 +2343,12 @@ Init *TGParser::ParseOperationFind(Record *CurRec, const RecTy *ItemType) { } SMLoc MHSLoc = Lex.getLoc(); - Init *MHS = ParseValue(CurRec); + const Init *MHS = ParseValue(CurRec); if (!MHS) return nullptr; SMLoc RHSLoc = Lex.getLoc(); - Init *RHS; + const Init *RHS; if (consume(tgtok::comma)) { RHSLoc = Lex.getLoc(); RHS = ParseValue(CurRec); @@ -2361,7 +2369,7 @@ Init *TGParser::ParseOperationFind(Record *CurRec, const RecTy *ItemType) { Type->getAsString() + "'"); } - TypedInit *LHSt = dyn_cast(LHS); + const TypedInit *LHSt = dyn_cast(LHS); if (!LHSt && !isa(LHS)) { TokError("could not determine type of the source string in !find"); return nullptr; @@ -2372,7 +2380,7 @@ Init *TGParser::ParseOperationFind(Record *CurRec, const RecTy *ItemType) { return nullptr; } - TypedInit *MHSt = dyn_cast(MHS); + const TypedInit *MHSt = dyn_cast(MHS); if (!MHSt && !isa(MHS)) { TokError("could not determine type of the target string in !find"); return nullptr; @@ -2384,7 +2392,7 @@ Init *TGParser::ParseOperationFind(Record *CurRec, const RecTy *ItemType) { } if (RHS) { - TypedInit *RHSt = dyn_cast(RHS); + const TypedInit *RHSt = dyn_cast(RHS); if (!RHSt && !isa(RHS)) { TokError("could not determine type of the start position in !find"); return nullptr; @@ -2403,8 +2411,8 @@ Init *TGParser::ParseOperationFind(Record *CurRec, const RecTy *ItemType) { /// /// ForEach ::= !foreach(ID, list-or-dag, expr) => list /// Filter ::= !foreach(ID, list, predicate) ==> list -Init *TGParser::ParseOperationForEachFilter(Record *CurRec, - const RecTy *ItemType) { +const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, + const RecTy *ItemType) { SMLoc OpLoc = Lex.getLoc(); tgtok::TokKind Operation = Lex.getCode(); Lex.Lex(); // eat the operation @@ -2418,7 +2426,7 @@ Init *TGParser::ParseOperationForEachFilter(Record *CurRec, return nullptr; } - Init *LHS = StringInit::get(Records, Lex.getCurStrVal()); + const Init *LHS = StringInit::get(Records, Lex.getCurStrVal()); Lex.Lex(); // eat the ID. 
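!substr and !find above share the same ternary shape with an optional third operand. For !substr, an omitted length means "to the end of the string"; its semantics as a plain C++ sketch (std::string standing in for StringInit; the real operator diagnoses out-of-range positions rather than clamping):

#include <cstddef>
#include <cstdint>
#include <limits>
#include <string>

std::string tgSubstr(const std::string &S, std::int64_t Start,
                     std::int64_t Length =
                         std::numeric_limits<std::int64_t>::max()) {
  if (Start < 0 || Start > static_cast<std::int64_t>(S.size()) || Length < 0)
    return std::string(); // TableGen reports an error here instead
  return S.substr(static_cast<std::size_t>(Start),
                  static_cast<std::size_t>(Length)); // substr clamps Length
}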
if (CurRec && CurRec->getValue(LHS)) { @@ -2433,7 +2441,7 @@ Init *TGParser::ParseOperationForEachFilter(Record *CurRec, return nullptr; } - Init *MHS = ParseValue(CurRec); + const Init *MHS = ParseValue(CurRec); if (!MHS) return nullptr; @@ -2442,7 +2450,7 @@ Init *TGParser::ParseOperationForEachFilter(Record *CurRec, return nullptr; } - TypedInit *MHSt = dyn_cast(MHS); + const TypedInit *MHSt = dyn_cast(MHS); if (!MHSt) { TokError("could not get type of !foreach/!filter list or dag"); return nullptr; @@ -2499,7 +2507,7 @@ Init *TGParser::ParseOperationForEachFilter(Record *CurRec, } TGVarScope *TempScope = PushScope(ParseRec); ParseRec->addValue(RecordVal(LHS, InEltType, RecordVal::FK_Normal)); - Init *RHS = ParseValue(ParseRec, ExprEltType); + const Init *RHS = ParseValue(ParseRec, ExprEltType); ParseRec->removeValue(LHS); PopScope(TempScope); if (!RHS) @@ -2512,7 +2520,7 @@ Init *TGParser::ParseOperationForEachFilter(Record *CurRec, const RecTy *OutType = InEltType; if (Operation == tgtok::XForEach && !IsDAG) { - TypedInit *RHSt = dyn_cast(RHS); + const TypedInit *RHSt = dyn_cast(RHS); if (!RHSt) { TokError("could not get type of !foreach result expression"); return nullptr; @@ -2528,7 +2536,8 @@ Init *TGParser::ParseOperationForEachFilter(Record *CurRec, ->Fold(CurRec); } -Init *TGParser::ParseOperationCond(Record *CurRec, const RecTy *ItemType) { +const Init *TGParser::ParseOperationCond(Record *CurRec, + const RecTy *ItemType) { Lex.Lex(); // eat the operation 'cond' if (!consume(tgtok::l_paren)) { @@ -2537,13 +2546,13 @@ Init *TGParser::ParseOperationCond(Record *CurRec, const RecTy *ItemType) { } // Parse through '[Case: Val,]+' - SmallVector Case; - SmallVector Val; + SmallVector Case; + SmallVector Val; while (true) { if (consume(tgtok::r_paren)) break; - Init *V = ParseValue(CurRec); + const Init *V = ParseValue(CurRec); if (!V) return nullptr; Case.push_back(V); @@ -2574,11 +2583,11 @@ Init *TGParser::ParseOperationCond(Record *CurRec, const RecTy *ItemType) { // resolve type const RecTy *Type = nullptr; - for (Init *V : Val) { + for (const Init *V : Val) { const RecTy *VTy = nullptr; - if (TypedInit *Vt = dyn_cast(V)) + if (const TypedInit *Vt = dyn_cast(V)) VTy = Vt->getType(); - if (BitsInit *Vbits = dyn_cast(V)) + if (const BitsInit *Vbits = dyn_cast(V)) VTy = BitsRecTy::get(Records, Vbits->getNumBits()); if (isa(V)) VTy = BitRecTy::get(Records); @@ -2633,9 +2642,9 @@ Init *TGParser::ParseOperationCond(Record *CurRec, const RecTy *ItemType) { /// SimpleValue ::= STRCONCATTOK '(' Value ',' Value ')' /// SimpleValue ::= COND '(' [Value ':' Value,]+ ')' /// -Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, - IDParseMode Mode) { - Init *R = nullptr; +const Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, + IDParseMode Mode) { + const Init *R = nullptr; tgtok::TokKind Code = Lex.getCode(); // Parse bang operators. @@ -2689,7 +2698,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, break; case tgtok::Id: { SMRange NameLoc = Lex.getLocRange(); - StringInit *Name = StringInit::get(Records, Lex.getCurStrVal()); + const StringInit *Name = StringInit::get(Records, Lex.getCurStrVal()); tgtok::TokKind Next = Lex.Lex(); if (Next == tgtok::equal) // Named argument. 
return Name; @@ -2706,7 +2715,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, return nullptr; } - SmallVector Args; + SmallVector Args; Lex.Lex(); // consume the < if (ParseTemplateArgValueList(Args, CurRec, Class)) return nullptr; // Error parsing value list. @@ -2724,7 +2733,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, case tgtok::l_brace: { // Value ::= '{' ValueList '}' SMLoc BraceLoc = Lex.getLoc(); Lex.Lex(); // eat the '{' - SmallVector Vals; + SmallVector Vals; if (Lex.getCode() != tgtok::r_brace) { ParseValueList(Vals, CurRec); @@ -2735,7 +2744,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, return nullptr; } - SmallVector NewBits; + SmallVector NewBits; // As we parse { a, b, ... }, 'a' is the highest bit, but we parse it // first. We'll first read everything in to a vector, then we can reverse @@ -2745,13 +2754,13 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, // if the API was a little more orthogonal. // bits values are allowed to initialize n bits. - if (BitsInit *BI = dyn_cast(Vals[i])) { + if (const BitsInit *BI = dyn_cast(Vals[i])) { for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i) NewBits.push_back(BI->getBit((e - i) - 1)); continue; } // bits can also come from variable initializers. - if (VarInit *VI = dyn_cast(Vals[i])) { + if (const VarInit *VI = dyn_cast(Vals[i])) { if (const BitsRecTy *BitsRec = dyn_cast(VI->getType())) { for (unsigned i = 0, e = BitsRec->getNumBits(); i != e; ++i) NewBits.push_back(VI->getBit((e - i) - 1)); @@ -2760,7 +2769,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, // Fallthrough to try convert this to a bit. } // All other values must be convertible to just a single bit. - Init *Bit = Vals[i]->getCastTo(BitRecTy::get(Records)); + const Init *Bit = Vals[i]->getCastTo(BitRecTy::get(Records)); if (!Bit) { Error(BraceLoc, "Element #" + Twine(i) + " (" + Vals[i]->getAsString() + ") is not convertable to a bit"); @@ -2773,7 +2782,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, } case tgtok::l_square: { // Value ::= '[' ValueList ']' Lex.Lex(); // eat the '[' - SmallVector Vals; + SmallVector Vals; const RecTy *DeducedEltTy = nullptr; const ListRecTy *GivenListTy = nullptr; @@ -2815,8 +2824,8 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, // Check elements const RecTy *EltTy = nullptr; - for (Init *V : Vals) { - TypedInit *TArg = dyn_cast(V); + for (const Init *V : Vals) { + const TypedInit *TArg = dyn_cast(V); if (TArg) { if (EltTy) { EltTy = resolveTypes(EltTy, TArg->getType()); @@ -2872,11 +2881,11 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, return nullptr; } - Init *Operator = ParseValue(CurRec); + const Init *Operator = ParseValue(CurRec); if (!Operator) return nullptr; // If the operator name is present, parse it. - StringInit *OperatorName = nullptr; + const StringInit *OperatorName = nullptr; if (consume(tgtok::colon)) { if (Lex.getCode() != tgtok::VarName) { // eat the ':' TokError("expected variable name in dag operator"); @@ -2886,7 +2895,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, Lex.Lex(); // eat the VarName. 
} - SmallVector, 8> DagArgs; + SmallVector, 8> DagArgs; if (Lex.getCode() != tgtok::r_paren) { ParseDagArgList(DagArgs, CurRec); if (DagArgs.empty()) return nullptr; @@ -2911,10 +2920,10 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, /// ValueSuffix ::= '[' SliceElements ']' /// ValueSuffix ::= '.' ID /// -Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, - IDParseMode Mode) { +const Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, + IDParseMode Mode) { SMLoc LHSLoc = Lex.getLoc(); - Init *Result = ParseSimpleValue(CurRec, ItemType, Mode); + const Init *Result = ParseSimpleValue(CurRec, ItemType, Mode); if (!Result) return nullptr; // Parse the suffixes now if present. @@ -2962,7 +2971,7 @@ Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, } Lex.Lex(); // eat the '[' - TypedInit *RHS = ParseSliceElements(CurRec, /*Single=*/true); + const TypedInit *RHS = ParseSliceElements(CurRec, /*Single=*/true); if (!RHS) return nullptr; @@ -2990,7 +2999,8 @@ Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, return nullptr; } SMRange FieldNameLoc = Lex.getLocRange(); - StringInit *FieldName = StringInit::get(Records, Lex.getCurStrVal()); + const StringInit *FieldName = + StringInit::get(Records, Lex.getCurStrVal()); if (!Result->getFieldType(FieldName)) { TokError("Cannot access field '" + Lex.getCurStrVal() + "' of value '" + Result->getAsString() + "'"); @@ -3018,7 +3028,7 @@ Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, case tgtok::paste: SMLoc PasteLoc = Lex.getLoc(); - TypedInit *LHS = dyn_cast(Result); + const TypedInit *LHS = dyn_cast(Result); if (!LHS) { Error(PasteLoc, "LHS of paste is not typed!"); return nullptr; @@ -3037,7 +3047,7 @@ Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, Result = LHS; // trailing paste, ignore. break; default: - Init *RHSResult = ParseValue(CurRec, ItemType, ParseValueMode); + const Init *RHSResult = ParseValue(CurRec, ItemType, ParseValueMode); if (!RHSResult) return nullptr; Result = BinOpInit::getListConcat(LHS, RHSResult); @@ -3060,7 +3070,7 @@ Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, LHS = CastLHS; } - TypedInit *RHS = nullptr; + const TypedInit *RHS = nullptr; Lex.Lex(); // Eat the '#'. switch (Lex.getCode()) { @@ -3076,7 +3086,7 @@ Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, break; default: - Init *RHSResult = ParseValue(CurRec, nullptr, ParseNameMode); + const Init *RHSResult = ParseValue(CurRec, nullptr, ParseNameMode); if (!RHSResult) return nullptr; RHS = dyn_cast(RHSResult); @@ -3113,26 +3123,26 @@ Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, /// DagArgList ::= DagArg /// DagArgList ::= DagArgList ',' DagArg void TGParser::ParseDagArgList( - SmallVectorImpl> &Result, + SmallVectorImpl> &Result, Record *CurRec) { while (true) { // DagArg ::= VARNAME if (Lex.getCode() == tgtok::VarName) { // A missing value is treated like '?'. - StringInit *VarName = StringInit::get(Records, Lex.getCurStrVal()); + const StringInit *VarName = StringInit::get(Records, Lex.getCurStrVal()); Result.emplace_back(UnsetInit::get(Records), VarName); Lex.Lex(); } else { // DagArg ::= Value (':' VARNAME)? - Init *Val = ParseValue(CurRec); + const Init *Val = ParseValue(CurRec); if (!Val) { Result.clear(); return; } // If the variable name is present, add it. 
- StringInit *VarName = nullptr; + const StringInit *VarName = nullptr; if (Lex.getCode() == tgtok::colon) { if (Lex.Lex() != tgtok::VarName) { // eat the ':' TokError("expected variable name in dag literal"); @@ -3156,8 +3166,8 @@ void TGParser::ParseDagArgList( /// /// ValueList ::= Value (',' Value) /// -void TGParser::ParseValueList(SmallVectorImpl &Result, Record *CurRec, - const RecTy *ItemType) { +void TGParser::ParseValueList(SmallVectorImpl &Result, + Record *CurRec, const RecTy *ItemType) { Result.push_back(ParseValue(CurRec, ItemType)); if (!Result.back()) { Result.clear(); @@ -3185,9 +3195,10 @@ void TGParser::ParseValueList(SmallVectorImpl &Result, Record *CurRec, // PostionalArgValueList ::= [Value {',' Value}*] // NamedArgValueList ::= [NameValue '=' Value {',' NameValue '=' Value}*] bool TGParser::ParseTemplateArgValueList( - SmallVectorImpl &Result, Record *CurRec, Record *ArgsRec) { + SmallVectorImpl &Result, Record *CurRec, + Record *ArgsRec) { assert(Result.empty() && "Result vector is not empty"); - ArrayRef TArgs = ArgsRec->getTemplateArgs(); + ArrayRef TArgs = ArgsRec->getTemplateArgs(); if (consume(tgtok::greater)) // empty value list return false; @@ -3203,7 +3214,7 @@ bool TGParser::ParseTemplateArgValueList( SMLoc ValueLoc = Lex.getLoc(); // If we are parsing named argument, we don't need to know the argument name // and argument type will be resolved after we know the name. - Init *Value = ParseValue( + const Init *Value = ParseValue( CurRec, HasNamedArg ? nullptr : ArgsRec->getValue(TArgs[ArgIndex])->getType()); if (!Value) @@ -3216,7 +3227,7 @@ bool TGParser::ParseTemplateArgValueList( "The name of named argument should be a valid identifier"); auto *Name = cast(Value); - Init *QualifiedName = QualifyName(*ArgsRec, Name); + const Init *QualifiedName = QualifyName(*ArgsRec, Name); auto *NamedArg = ArgsRec->getValue(QualifiedName); if (!NamedArg) return Error(ValueLoc, @@ -3261,7 +3272,7 @@ bool TGParser::ParseTemplateArgValueList( /// /// Declaration ::= FIELD? Type ID ('=' Value)? /// -Init *TGParser::ParseDeclaration(Record *CurRec, +const Init *TGParser::ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs) { // Read the field prefix if present. bool HasField = consume(tgtok::Field); @@ -3286,7 +3297,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec, } SMLoc IdLoc = Lex.getLoc(); - Init *DeclName = StringInit::get(Records, Str); + const Init *DeclName = StringInit::get(Records, Str); Lex.Lex(); bool BadField; @@ -3313,7 +3324,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec, // If a value is present, parse it and set new field's value. if (consume(tgtok::equal)) { SMLoc ValLoc = Lex.getLoc(); - Init *Val = ParseValue(CurRec, Type); + const Init *Val = ParseValue(CurRec, Type); if (!Val || SetValue(CurRec, ValLoc, DeclName, {}, Val, /*AllowSelfAssignment=*/false, /*OverrideDefLoc=*/false)) { @@ -3335,13 +3346,14 @@ Init *TGParser::ParseDeclaration(Record *CurRec, /// ForeachDeclaration ::= ID '=' RangePiece /// ForeachDeclaration ::= ID '=' Value /// -VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { +const VarInit * +TGParser::ParseForeachDeclaration(const Init *&ForeachListValue) { if (Lex.getCode() != tgtok::Id) { TokError("Expected identifier in foreach declaration"); return nullptr; } - Init *DeclName = StringInit::get(Records, Lex.getCurStrVal()); + const Init *DeclName = StringInit::get(Records, Lex.getCurStrVal()); Lex.Lex(); // If a value is present, parse it. 
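ParseTemplateArgValueList above accepts positional values followed by name = value pairs, rejecting a positional argument once a named one has appeared, and resolving each named argument against the receiving class. That binding discipline as a self-contained sketch (plain strings for names; the qualification step is omitted here):

#include <map>
#include <optional>
#include <string>
#include <vector>

struct ParsedArg {
  std::optional<std::string> Name; // engaged for `name = value`
  std::string Value;
};

// Returns true on error, matching the parser's convention.
bool bindTemplateArgs(const std::vector<std::string> &DeclaredOrder,
                      const std::vector<ParsedArg> &Args,
                      std::map<std::string, std::string> &Bound) {
  std::size_t NextPositional = 0;
  bool SeenNamed = false;
  for (const ParsedArg &A : Args) {
    if (A.Name) {
      SeenNamed = true;
      Bound[*A.Name] = A.Value;
      continue;
    }
    if (SeenNamed)
      return true; // positional argument after a named one
    if (NextPositional >= DeclaredOrder.size())
      return true; // too many positional arguments
    Bound[DeclaredOrder[NextPositional++]] = A.Value;
  }
  return false;
}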
@@ -3366,11 +3378,11 @@ VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { default: { SMLoc ValueLoc = Lex.getLoc(); - Init *I = ParseValue(nullptr); + const Init *I = ParseValue(nullptr); if (!I) return nullptr; - TypedInit *TI = dyn_cast(I); + const TypedInit *TI = dyn_cast(I); if (TI && isa(TI->getType())) { ForeachListValue = I; IterType = cast(TI->getType())->getElementType(); @@ -3422,7 +3434,7 @@ bool TGParser::ParseTemplateArgList(Record *CurRec) { Record *TheRecToAddTo = CurRec ? CurRec : &CurMultiClass->Rec; // Read the first declaration. - Init *TemplArg = ParseDeclaration(CurRec, true/*templateargs*/); + const Init *TemplArg = ParseDeclaration(CurRec, true /*templateargs*/); if (!TemplArg) return true; @@ -3479,7 +3491,7 @@ bool TGParser::ParseBodyItem(Record *CurRec) { return TokError("expected field identifier after let"); SMLoc IdLoc = Lex.getLoc(); - StringInit *FieldName = StringInit::get(Records, Lex.getCurStrVal()); + const StringInit *FieldName = StringInit::get(Records, Lex.getCurStrVal()); Lex.Lex(); // eat the field name. SmallVector BitList; @@ -3501,7 +3513,7 @@ bool TGParser::ParseBodyItem(Record *CurRec) { Type = BitsRecTy::get(Records, BitList.size()); } - Init *Val = ParseValue(CurRec, Type); + const Init *Val = ParseValue(CurRec, Type); if (!Val) return true; if (!consume(tgtok::semi)) @@ -3629,7 +3641,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { // Parse ObjectName and make a record for it. std::unique_ptr CurRec; - Init *Name = ParseObjectName(CurMultiClass); + const Init *Name = ParseObjectName(CurMultiClass); if (!Name) return true; @@ -3665,7 +3677,7 @@ bool TGParser::ParseDefset() { if (Lex.getCode() != tgtok::Id) return TokError("expected identifier"); - StringInit *DeclName = StringInit::get(Records, Lex.getCurStrVal()); + const StringInit *DeclName = StringInit::get(Records, Lex.getCurStrVal()); if (Records.getGlobal(DeclName->getValue())) return TokError("def or global variable of this name already exists"); @@ -3738,7 +3750,7 @@ bool TGParser::ParseDefvar(Record *CurRec) { if (Lex.getCode() != tgtok::Id) return TokError("expected identifier"); - StringInit *DeclName = StringInit::get(Records, Lex.getCurStrVal()); + const StringInit *DeclName = StringInit::get(Records, Lex.getCurStrVal()); if (CurScope->varAlreadyDefined(DeclName->getValue())) return TokError("local variable of this name already exists"); @@ -3758,7 +3770,7 @@ bool TGParser::ParseDefvar(Record *CurRec) { if (!consume(tgtok::equal)) return TokError("expected '='"); - Init *Value = ParseValue(CurRec); + const Init *Value = ParseValue(CurRec); if (!Value) return true; @@ -3786,8 +3798,8 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) { // Make a temporary object to record items associated with the for // loop. - Init *ListValue = nullptr; - VarInit *IterName = ParseForeachDeclaration(ListValue); + const Init *ListValue = nullptr; + const VarInit *IterName = ParseForeachDeclaration(ListValue); if (!IterName) return TokError("expected declaration in for"); @@ -3840,7 +3852,7 @@ bool TGParser::ParseIf(MultiClass *CurMultiClass) { // Make a temporary object to record items associated with the for // loop. - Init *Condition = ParseValue(nullptr); + const Init *Condition = ParseValue(nullptr); if (!Condition) return true; @@ -3853,14 +3865,14 @@ bool TGParser::ParseIf(MultiClass *CurMultiClass) { // loop, over a list of length 0 or 1 depending on the condition, and with no // iteration variable being assigned. 
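Concretely, the two clauses select between a singleton list and an empty list, so exactly one body is instantiated. A toy rendering of that selection (the parser builds these as interned ListInits, just below):

#include <vector>

// `if c then A` iterates over thenList(c); `else B` over elseList(c).
// The two lengths always sum to 1, so exactly one clause fires.
std::vector<int> thenList(bool Cond) {
  return Cond ? std::vector<int>{1} : std::vector<int>{};
}
std::vector<int> elseList(bool Cond) {
  return Cond ? std::vector<int>{} : std::vector<int>{1};
}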
- ListInit *EmptyList = ListInit::get({}, BitRecTy::get(Records)); - ListInit *SingletonList = + const ListInit *EmptyList = ListInit::get({}, BitRecTy::get(Records)); + const ListInit *SingletonList = ListInit::get({BitInit::get(Records, true)}, BitRecTy::get(Records)); const RecTy *BitListTy = ListRecTy::get(BitRecTy::get(Records)); // The foreach containing the then-clause selects SingletonList if // the condition is true. - Init *ThenClauseList = + const Init *ThenClauseList = TernOpInit::get(TernOpInit::IF, Condition, SingletonList, EmptyList, BitListTy) ->Fold(nullptr); @@ -3882,7 +3894,7 @@ bool TGParser::ParseIf(MultiClass *CurMultiClass) { if (consume(tgtok::ElseKW)) { // The foreach containing the else-clause uses the same pair of lists as // above, but this time, selects SingletonList if the condition is *false*. - Init *ElseClauseList = + const Init *ElseClauseList = TernOpInit::get(TernOpInit::IF, Condition, EmptyList, SingletonList, BitListTy) ->Fold(nullptr); @@ -3942,7 +3954,7 @@ bool TGParser::ParseAssert(MultiClass *CurMultiClass, Record *CurRec) { Lex.Lex(); // Eat the 'assert' token. SMLoc ConditionLoc = Lex.getLoc(); - Init *Condition = ParseValue(CurRec); + const Init *Condition = ParseValue(CurRec); if (!Condition) return true; @@ -3951,7 +3963,7 @@ bool TGParser::ParseAssert(MultiClass *CurMultiClass, Record *CurRec) { return true; } - Init *Message = ParseValue(CurRec); + const Init *Message = ParseValue(CurRec); if (!Message) return true; @@ -4032,7 +4044,7 @@ void TGParser::ParseLetList(SmallVectorImpl &Result) { return; } - StringInit *Name = StringInit::get(Records, Lex.getCurStrVal()); + const StringInit *Name = StringInit::get(Records, Lex.getCurStrVal()); SMLoc NameLoc = Lex.getLoc(); Lex.Lex(); // Eat the identifier. @@ -4050,7 +4062,7 @@ void TGParser::ParseLetList(SmallVectorImpl &Result) { return; } - Init *Val = ParseValue(nullptr); + const Init *Val = ParseValue(nullptr); if (!Val) { Result.clear(); return; @@ -4226,7 +4238,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { assert(Lex.getCode() == tgtok::Defm && "Unexpected token!"); Lex.Lex(); // eat the defm - Init *DefmName = ParseObjectName(CurMultiClass); + const Init *DefmName = ParseObjectName(CurMultiClass); if (!DefmName) return true; if (isa(DefmName)) { @@ -4399,11 +4411,11 @@ bool TGParser::ParseFile() { // If necessary, replace an argument with a cast to the required type. // The argument count has already been checked. 
bool TGParser::CheckTemplateArgValues( - SmallVectorImpl &Values, SMLoc Loc, Record *ArgsRec) { - ArrayRef TArgs = ArgsRec->getTemplateArgs(); + SmallVectorImpl &Values, SMLoc Loc, Record *ArgsRec) { + ArrayRef TArgs = ArgsRec->getTemplateArgs(); - for (llvm::ArgumentInit *&Value : Values) { - Init *ArgName = nullptr; + for (const ArgumentInit *&Value : Values) { + const Init *ArgName = nullptr; if (Value->isPositional()) ArgName = TArgs[Value->getIndex()]; if (Value->isNamed()) @@ -4412,7 +4424,7 @@ bool TGParser::CheckTemplateArgValues( RecordVal *Arg = ArgsRec->getValue(ArgName); const RecTy *ArgType = Arg->getType(); - if (TypedInit *ArgValue = dyn_cast(Value->getValue())) { + if (const TypedInit *ArgValue = dyn_cast(Value->getValue())) { auto *CastValue = ArgValue->getCastTo(ArgType); if (CastValue) { assert((!isa(CastValue) || @@ -4466,7 +4478,7 @@ bool TGParser::ParseDump(MultiClass *CurMultiClass, Record *CurRec) { assert(Lex.getCode() == tgtok::Dump && "Unknown tok"); Lex.Lex(); // eat the operation - Init *Message = ParseValue(CurRec); + const Init *Message = ParseValue(CurRec); if (!Message) return true; @@ -4485,7 +4497,7 @@ bool TGParser::ParseDump(MultiClass *CurMultiClass, Record *CurRec) { HasReferenceResolver resolver{nullptr}; resolver.setFinal(true); // force a resolution with a dummy resolver - Init *ResolvedMessage = Message->resolveReferences(resolver); + const Init *ResolvedMessage = Message->resolveReferences(resolver); addEntry(std::make_unique(Loc, ResolvedMessage)); } diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h index f33ae1ce2c81..a1f1db6622ac 100644 --- a/llvm/lib/TableGen/TGParser.h +++ b/llvm/lib/TableGen/TGParser.h @@ -27,11 +27,11 @@ struct SubClassReference; struct SubMultiClassReference; struct LetRecord { - StringInit *Name; + const StringInit *Name; std::vector Bits; - Init *Value; + const Init *Value; SMLoc Loc; - LetRecord(StringInit *N, ArrayRef B, Init *V, SMLoc L) + LetRecord(const StringInit *N, ArrayRef B, const Init *V, SMLoc L) : Name(N), Bits(B), Value(V), Loc(L) {} }; @@ -62,13 +62,13 @@ struct RecordsEntry { /// constructed by desugaring an if statement.) struct ForeachLoop { SMLoc Loc; - VarInit *IterVar; - Init *ListValue; + const VarInit *IterVar; + const Init *ListValue; std::vector Entries; void dump() const; - ForeachLoop(SMLoc Loc, VarInit *IVar, Init *LValue) + ForeachLoop(SMLoc Loc, const VarInit *IVar, const Init *LValue) : Loc(Loc), IterVar(IVar), ListValue(LValue) {} }; @@ -96,7 +96,7 @@ private: ScopeKind Kind; std::unique_ptr Parent; // A scope to hold variable definitions from defvar. 
- std::map> Vars; + std::map> Vars; Record *CurRec = nullptr; ForeachLoop *CurLoop = nullptr; MultiClass *CurMultiClass = nullptr; @@ -118,9 +118,9 @@ public: return std::move(Parent); } - Init *getVar(RecordKeeper &Records, MultiClass *ParsingMultiClass, - StringInit *Name, SMRange NameLoc, - bool TrackReferenceLocs) const; + const Init *getVar(RecordKeeper &Records, MultiClass *ParsingMultiClass, + const StringInit *Name, SMRange NameLoc, + bool TrackReferenceLocs) const; bool varAlreadyDefined(StringRef Name) const { // When we check whether a variable is already defined, for the purpose of @@ -130,7 +130,7 @@ public: return Vars.find(Name) != Vars.end(); } - void addVar(StringRef Name, Init *I) { + void addVar(StringRef Name, const Init *I) { bool Ins = Vars.insert(std::make_pair(std::string(Name), I)).second; (void)Ins; assert(Ins && "Local variable already exists"); @@ -228,15 +228,15 @@ private: // Semantic analysis methods. /// Set the value of a RecordVal within the given record. If `OverrideDefLoc` /// is set, the provided location overrides any existing location of the /// RecordVal. - bool SetValue(Record *TheRec, SMLoc Loc, Init *ValName, - ArrayRef BitList, Init *V, + bool SetValue(Record *TheRec, SMLoc Loc, const Init *ValName, + ArrayRef BitList, const Init *V, bool AllowSelfAssignment = false, bool OverrideDefLoc = true); bool AddSubClass(Record *Rec, SubClassReference &SubClass); bool AddSubClass(RecordsEntry &Entry, SubClassReference &SubClass); bool AddSubMultiClass(MultiClass *CurMC, SubMultiClassReference &SubMultiClass); - using SubstStack = SmallVector, 8>; + using SubstStack = SmallVector, 8>; bool addEntry(RecordsEntry E); bool resolve(const ForeachLoop &Loop, SubstStack &Stack, bool Final, @@ -246,15 +246,16 @@ private: // Semantic analysis methods. SMLoc *Loc = nullptr); bool addDefOne(std::unique_ptr Rec); - using ArgValueHandler = std::function; + using ArgValueHandler = std::function; bool resolveArguments( - Record *Rec, ArrayRef ArgValues, SMLoc Loc, - ArgValueHandler ArgValueHandler = [](Init *, Init *) {}); + Record *Rec, ArrayRef ArgValues, SMLoc Loc, + ArgValueHandler ArgValueHandler = [](const Init *, const Init *) {}); bool resolveArgumentsOfClass(MapResolver &R, Record *Rec, - ArrayRef ArgValues, SMLoc Loc); + ArrayRef ArgValues, + SMLoc Loc); bool resolveArgumentsOfMultiClass(SubstStack &Substs, MultiClass *MC, - ArrayRef ArgValues, - Init *DefmName, SMLoc Loc); + ArrayRef ArgValues, + const Init *DefmName, SMLoc Loc); private: // Parser methods. bool consume(tgtok::TokKind K); @@ -280,45 +281,46 @@ private: // Parser methods. 
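TGVarScope, shown above, keys defvar bindings by name and walks parent scopes on lookup; after this patch the stored values are const Init pointers. A minimal stand-alone model of the scope chain (toy types, not the real class):

#include <cassert>
#include <map>
#include <string>

struct ToyScopeVal {};

struct ToyScope {
  const ToyScope *Parent = nullptr;
  std::map<std::string, const ToyScopeVal *> Vars;

  void addVar(const std::string &Name, const ToyScopeVal *I) {
    bool Ins = Vars.emplace(Name, I).second;
    (void)Ins;
    assert(Ins && "Local variable already exists");
  }

  const ToyScopeVal *getVar(const std::string &Name) const {
    auto It = Vars.find(Name);
    if (It != Vars.end())
      return It->second;
    return Parent ? Parent->getVar(Name) : nullptr;
  }
};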
bool ParseBodyItem(Record *CurRec); bool ParseTemplateArgList(Record *CurRec); - Init *ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs); - VarInit *ParseForeachDeclaration(Init *&ForeachListValue); + const Init *ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs); + const VarInit *ParseForeachDeclaration(const Init *&ForeachListValue); SubClassReference ParseSubClassReference(Record *CurRec, bool isDefm); SubMultiClassReference ParseSubMultiClassReference(MultiClass *CurMC); - Init *ParseIDValue(Record *CurRec, StringInit *Name, SMRange NameLoc, - IDParseMode Mode = ParseValueMode); - Init *ParseSimpleValue(Record *CurRec, const RecTy *ItemType = nullptr, + const Init *ParseIDValue(Record *CurRec, const StringInit *Name, + SMRange NameLoc, IDParseMode Mode = ParseValueMode); + const Init *ParseSimpleValue(Record *CurRec, const RecTy *ItemType = nullptr, + IDParseMode Mode = ParseValueMode); + const Init *ParseValue(Record *CurRec, const RecTy *ItemType = nullptr, IDParseMode Mode = ParseValueMode); - Init *ParseValue(Record *CurRec, const RecTy *ItemType = nullptr, - IDParseMode Mode = ParseValueMode); - void ParseValueList(SmallVectorImpl &Result, Record *CurRec, + void ParseValueList(SmallVectorImpl &Result, Record *CurRec, const RecTy *ItemType = nullptr); - bool ParseTemplateArgValueList(SmallVectorImpl &Result, + bool ParseTemplateArgValueList(SmallVectorImpl &Result, Record *CurRec, Record *ArgsRec); void ParseDagArgList( - SmallVectorImpl> &Result, + SmallVectorImpl> &Result, Record *CurRec); bool ParseOptionalRangeList(SmallVectorImpl &Ranges); bool ParseOptionalBitList(SmallVectorImpl &Ranges); - TypedInit *ParseSliceElement(Record *CurRec); - TypedInit *ParseSliceElements(Record *CurRec, bool Single = false); + const TypedInit *ParseSliceElement(Record *CurRec); + const TypedInit *ParseSliceElements(Record *CurRec, bool Single = false); void ParseRangeList(SmallVectorImpl &Result); bool ParseRangePiece(SmallVectorImpl &Ranges, - TypedInit *FirstItem = nullptr); + const TypedInit *FirstItem = nullptr); const RecTy *ParseType(); - Init *ParseOperation(Record *CurRec, const RecTy *ItemType); - Init *ParseOperationSubstr(Record *CurRec, const RecTy *ItemType); - Init *ParseOperationFind(Record *CurRec, const RecTy *ItemType); - Init *ParseOperationForEachFilter(Record *CurRec, const RecTy *ItemType); - Init *ParseOperationCond(Record *CurRec, const RecTy *ItemType); + const Init *ParseOperation(Record *CurRec, const RecTy *ItemType); + const Init *ParseOperationSubstr(Record *CurRec, const RecTy *ItemType); + const Init *ParseOperationFind(Record *CurRec, const RecTy *ItemType); + const Init *ParseOperationForEachFilter(Record *CurRec, + const RecTy *ItemType); + const Init *ParseOperationCond(Record *CurRec, const RecTy *ItemType); const RecTy *ParseOperatorType(); - Init *ParseObjectName(MultiClass *CurMultiClass); + const Init *ParseObjectName(MultiClass *CurMultiClass); Record *ParseClassID(); MultiClass *ParseMultiClassID(); bool ApplyLetStack(Record *CurRec); bool ApplyLetStack(RecordsEntry &Entry); - bool CheckTemplateArgValues(SmallVectorImpl &Values, + bool CheckTemplateArgValues(SmallVectorImpl &Values, SMLoc Loc, Record *ArgsRec); }; diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index fe9621a89374..e3d9d010f9ae 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -1208,7 +1208,7 @@ ClassInfo *AsmMatcherInfo::getOperandClass(const Record *Rec, int 
SubOpIdx) { "Record `" + Rec->getName() + "' does not have a ParserMatchClass!\n"); - if (DefInit *DI = dyn_cast(R->getValue())) { + if (const DefInit *DI = dyn_cast(R->getValue())) { const Record *MatchClass = DI->getDef(); if (ClassInfo *CI = AsmOperandClasses[MatchClass]) return CI; @@ -1349,12 +1349,12 @@ void AsmMatcherInfo::buildRegisterClasses( } else CI->ValueName = CI->ValueName + "," + RC.getName(); - Init *DiagnosticType = Def->getValueInit("DiagnosticType"); - if (StringInit *SI = dyn_cast(DiagnosticType)) + const Init *DiagnosticType = Def->getValueInit("DiagnosticType"); + if (const StringInit *SI = dyn_cast(DiagnosticType)) CI->DiagnosticType = std::string(SI->getValue()); - Init *DiagnosticString = Def->getValueInit("DiagnosticString"); - if (StringInit *SI = dyn_cast(DiagnosticString)) + const Init *DiagnosticString = Def->getValueInit("DiagnosticString"); + if (const StringInit *SI = dyn_cast(DiagnosticString)) CI->DiagnosticString = std::string(SI->getValue()); // If we have a diagnostic string but the diagnostic type is not specified @@ -1398,9 +1398,9 @@ void AsmMatcherInfo::buildOperandClasses() { ClassInfo *CI = AsmOperandClasses[Rec]; CI->Kind = ClassInfo::UserClass0 + Index; - ListInit *Supers = Rec->getValueAsListInit("SuperClasses"); - for (Init *I : Supers->getValues()) { - DefInit *DI = dyn_cast(I); + const ListInit *Supers = Rec->getValueAsListInit("SuperClasses"); + for (const Init *I : Supers->getValues()) { + const DefInit *DI = dyn_cast(I); if (!DI) { PrintError(Rec->getLoc(), "Invalid super class reference!"); continue; @@ -1417,8 +1417,8 @@ void AsmMatcherInfo::buildOperandClasses() { CI->ValueName = std::string(Rec->getName()); // Get or construct the predicate method name. - Init *PMName = Rec->getValueInit("PredicateMethod"); - if (StringInit *SI = dyn_cast(PMName)) { + const Init *PMName = Rec->getValueInit("PredicateMethod"); + if (const StringInit *SI = dyn_cast(PMName)) { CI->PredicateMethod = std::string(SI->getValue()); } else { assert(isa(PMName) && "Unexpected PredicateMethod field!"); @@ -1426,8 +1426,8 @@ void AsmMatcherInfo::buildOperandClasses() { } // Get or construct the render method name. - Init *RMName = Rec->getValueInit("RenderMethod"); - if (StringInit *SI = dyn_cast(RMName)) { + const Init *RMName = Rec->getValueInit("RenderMethod"); + if (const StringInit *SI = dyn_cast(RMName)) { CI->RenderMethod = std::string(SI->getValue()); } else { assert(isa(RMName) && "Unexpected RenderMethod field!"); @@ -1435,29 +1435,29 @@ void AsmMatcherInfo::buildOperandClasses() { } // Get the parse method name or leave it as empty. - Init *PRMName = Rec->getValueInit("ParserMethod"); - if (StringInit *SI = dyn_cast(PRMName)) + const Init *PRMName = Rec->getValueInit("ParserMethod"); + if (const StringInit *SI = dyn_cast(PRMName)) CI->ParserMethod = std::string(SI->getValue()); // Get the diagnostic type and string or leave them as empty. 
- Init *DiagnosticType = Rec->getValueInit("DiagnosticType"); - if (StringInit *SI = dyn_cast(DiagnosticType)) + const Init *DiagnosticType = Rec->getValueInit("DiagnosticType"); + if (const StringInit *SI = dyn_cast(DiagnosticType)) CI->DiagnosticType = std::string(SI->getValue()); - Init *DiagnosticString = Rec->getValueInit("DiagnosticString"); - if (StringInit *SI = dyn_cast(DiagnosticString)) + const Init *DiagnosticString = Rec->getValueInit("DiagnosticString"); + if (const StringInit *SI = dyn_cast(DiagnosticString)) CI->DiagnosticString = std::string(SI->getValue()); // If we have a DiagnosticString, we need a DiagnosticType for use within // the matcher. if (!CI->DiagnosticString.empty() && CI->DiagnosticType.empty()) CI->DiagnosticType = CI->ClassName; - Init *IsOptional = Rec->getValueInit("IsOptional"); - if (BitInit *BI = dyn_cast(IsOptional)) + const Init *IsOptional = Rec->getValueInit("IsOptional"); + if (const BitInit *BI = dyn_cast(IsOptional)) CI->IsOptional = BI->getValue(); // Get or construct the default method name. - Init *DMName = Rec->getValueInit("DefaultMethod"); - if (StringInit *SI = dyn_cast(DMName)) { + const Init *DMName = Rec->getValueInit("DefaultMethod"); + if (const StringInit *SI = dyn_cast(DMName)) { CI->DefaultMethod = std::string(SI->getValue()); } else { assert(isa(DMName) && "Unexpected DefaultMethod field!"); diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp index 83205b50f6e2..3f09564cc0d6 100644 --- a/llvm/utils/TableGen/AsmWriterEmitter.cpp +++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp @@ -1031,9 +1031,9 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { bool IsOr = CombineType == "any_of"; // Change (any_of FeatureAll, (any_of ...)) to (any_of FeatureAll, ...). if (IsOr && D->getNumArgs() == 2 && isa(D->getArg(1))) { - DagInit *RHS = cast(D->getArg(1)); - SmallVector Args{D->getArg(0)}; - SmallVector ArgNames{D->getArgName(0)}; + const DagInit *RHS = cast(D->getArg(1)); + SmallVector Args{D->getArg(0)}; + SmallVector ArgNames{D->getArgName(0)}; for (unsigned i = 0, e = RHS->getNumArgs(); i != e; ++i) { Args.push_back(RHS->getArg(i)); ArgNames.push_back(RHS->getArgName(i)); diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp index ed00debc398c..138275356dc9 100644 --- a/llvm/utils/TableGen/Attributes.cpp +++ b/llvm/utils/TableGen/Attributes.cpp @@ -122,7 +122,7 @@ void Attributes::emitAttributeProperties(raw_ostream &OS) { bool AllowIntersectMin = KindName == "IntAttr"; for (auto *A : Records.getAllDerivedDefinitions(KindName)) { OS << "0"; - for (Init *P : *A->getValueAsListInit("Properties")) { + for (const Init *P : *A->getValueAsListInit("Properties")) { if (!AllowIntersectAnd && cast(P)->getDef()->getName() == "IntersectAnd") PrintFatalError("'IntersectAnd' only compatible with 'EnumAttr'"); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index 2a246d60de61..18e0b8fd135b 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -324,7 +324,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, IS.ParamTys.push_back(TypeList->getElementAsRecord(Idx)); // Parse the intrinsic properties. 
- ListInit *PropList = R->getValueAsListInit("IntrProperties"); + const ListInit *PropList = R->getValueAsListInit("IntrProperties"); for (unsigned i = 0, e = PropList->size(); i != e; ++i) { const Record *Property = PropList->getElementAsRecord(i); assert(Property->isSubClassOf("IntrinsicProperty") && diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index 4d356774f98d..be822c481528 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -348,7 +348,7 @@ CodeEmitterGen::getInstructionCases(const Record *R, void CodeEmitterGen::addInstructionCasesForEncoding( const Record *R, const Record *EncodingDef, const CodeGenTarget &Target, std::string &Case, std::string &BitOffsetCase) { - BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst"); + const BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst"); // Loop over all of the fields in the instruction, determining which are the // operands to the instruction. diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index b599ee149bcd..7876db6f33df 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -128,7 +128,7 @@ public: // Ex: ValueCols = [['true'],['false']] -- it results two columns in the // table. First column requires all the instructions to have predSense // set to 'true' and second column requires it to be 'false'. - ListInit *ColValList = MapRec->getValueAsListInit("ValueCols"); + const ListInit *ColValList = MapRec->getValueAsListInit("ValueCols"); // Each instruction map must specify at least one column for it to be valid. if (ColValList->empty()) @@ -479,7 +479,7 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) { OS << "// " << InstrMapDesc.getName() << "\nLLVM_READONLY\n"; OS << "int " << InstrMapDesc.getName() << "(uint16_t Opcode"; if (ValueCols.size() > 1) { - for (Init *CF : ColFields->getValues()) { + for (const Init *CF : ColFields->getValues()) { std::string ColName = CF->getAsUnquotedString(); OS << ", enum " << ColName << " in" << ColName; } diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index 751ac3dd0af1..d2228c902a56 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -2639,7 +2639,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) { // If the operand has sub-operands, they may be provided by distinct // child patterns, so attempt to match each sub-operand separately. if (OperandNode->isSubClassOf("Operand")) { - DagInit *MIOpInfo = OperandNode->getValueAsDag("MIOperandInfo"); + const DagInit *MIOpInfo = OperandNode->getValueAsDag("MIOperandInfo"); if (unsigned NumArgs = MIOpInfo->getNumArgs()) { // But don't do that if the whole operand is being provided by // a single ComplexPattern-related Operand. 
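The hunks above all apply the same mechanical change: pointers to TableGen Init objects obtained from the Record API become const-qualified, and LLVM's cast machinery propagates the qualifier. As a minimal, self-contained sketch of the resulting idiom (the field name "MyField" and the helper name are invented for illustration, not taken from the patch):

#include "llvm/TableGen/Record.h"
using namespace llvm;

// Read an optional string-valued field from a record, const-correctly.
// dyn_cast<StringInit> applied to a `const Init *` yields a
// `const StringInit *`, so no const_cast is needed on a read-only path.
static std::string getOptionalString(const Record *Rec) {
  const Init *V = Rec->getValueInit("MyField"); // hypothetical field name
  if (const StringInit *SI = dyn_cast<StringInit>(V))
    return SI->getValue().str();
  return std::string(); // field left unset
}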
@@ -2786,11 +2786,11 @@ TreePattern::TreePattern(const Record *TheRec, const ListInit *RawPat, bool isInput, CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), Infer(*this) { - for (Init *I : RawPat->getValues()) + for (const Init *I : RawPat->getValues()) Trees.push_back(ParseTreePattern(I, "")); } -TreePattern::TreePattern(const Record *TheRec, DagInit *Pat, bool isInput, +TreePattern::TreePattern(const Record *TheRec, const DagInit *Pat, bool isInput, CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), Infer(*this) { @@ -2825,12 +2825,12 @@ void TreePattern::ComputeNamedNodes(TreePatternNode &N) { ComputeNamedNodes(N.getChild(i)); } -TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, +TreePatternNodePtr TreePattern::ParseTreePattern(const Init *TheInit, StringRef OpName) { RecordKeeper &RK = TheInit->getRecordKeeper(); // Here, we are creating new records (BitsInit->InitInit), so const_cast // TheInit back to non-const pointer. - if (DefInit *DI = dyn_cast(TheInit)) { + if (const DefInit *DI = dyn_cast(TheInit)) { const Record *R = DI->getDef(); // Direct reference to a leaf DagNode or PatFrag? Turn it into a @@ -2838,8 +2838,9 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, /// (foo GPR, imm) -> (foo GPR, (imm)) if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrags")) return ParseTreePattern( - DagInit::get(DI, nullptr, - std::vector>()), + DagInit::get( + DI, nullptr, + std::vector>()), OpName); // Input argument? @@ -2872,22 +2873,22 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, return makeIntrusiveRefCnt(TheInit, 1); } - if (BitsInit *BI = dyn_cast(TheInit)) { + if (const BitsInit *BI = dyn_cast(TheInit)) { // Turn this into an IntInit. - Init *II = BI->convertInitializerTo(IntRecTy::get(RK)); + const Init *II = BI->convertInitializerTo(IntRecTy::get(RK)); if (!II || !isa(II)) error("Bits value must be constants!"); return II ? ParseTreePattern(II, OpName) : nullptr; } - DagInit *Dag = dyn_cast(TheInit); + const DagInit *Dag = dyn_cast(TheInit); if (!Dag) { TheInit->print(errs()); error("Pattern has unexpected init kind!"); return nullptr; } - auto ParseCastOperand = [this](DagInit *Dag, StringRef OpName) { + auto ParseCastOperand = [this](const DagInit *Dag, StringRef OpName) { if (Dag->getNumArgs() != 1) error("Type cast only takes one operand!"); @@ -2897,7 +2898,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, return ParseTreePattern(Dag->getArg(0), Dag->getArgNameStr(0)); }; - if (ListInit *LI = dyn_cast(Dag->getOperator())) { + if (const ListInit *LI = dyn_cast(Dag->getOperator())) { // If the operator is a list (of value types), then this must be "type cast" // of a leaf node with multiple results. 
TreePatternNodePtr New = ParseCastOperand(Dag, OpName); @@ -2915,7 +2916,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(Init *TheInit, return New; } - DefInit *OpDef = dyn_cast(Dag->getOperator()); + const DefInit *OpDef = dyn_cast(Dag->getOperator()); if (!OpDef) { error("Pattern has unexpected operator type!"); return nullptr; @@ -3252,7 +3253,7 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { if (OutFrags != Frag->isSubClassOf("OutPatFrag")) continue; - ListInit *LI = Frag->getValueAsListInit("Fragments"); + const ListInit *LI = Frag->getValueAsListInit("Fragments"); TreePattern *P = (PatternFragments[Frag] = std::make_unique( Frag, LI, !Frag->isSubClassOf("OutPatFrag"), *this)) .get(); @@ -3268,8 +3269,8 @@ void CodeGenDAGPatterns::ParsePatternFragments(bool OutFrags) { P->error("Cannot have unnamed 'node' values in pattern fragment!"); // Parse the operands list. - DagInit *OpsList = Frag->getValueAsDag("Operands"); - DefInit *OpsOp = dyn_cast(OpsList->getOperator()); + const DagInit *OpsList = Frag->getValueAsDag("Operands"); + const DefInit *OpsOp = dyn_cast(OpsList->getOperator()); // Special cases: ops == outs == ins. Different names are used to // improve readability. if (!OpsOp || (OpsOp->getDef()->getName() != "ops" && @@ -3336,18 +3337,18 @@ void CodeGenDAGPatterns::ParseDefaultOperands() { // Find some SDNode. assert(!SDNodes.empty() && "No SDNodes parsed?"); - Init *SomeSDNode = SDNodes.begin()->first->getDefInit(); + const Init *SomeSDNode = SDNodes.begin()->first->getDefInit(); for (unsigned i = 0, e = DefaultOps.size(); i != e; ++i) { - DagInit *DefaultInfo = DefaultOps[i]->getValueAsDag("DefaultOps"); + const DagInit *DefaultInfo = DefaultOps[i]->getValueAsDag("DefaultOps"); // Clone the DefaultInfo dag node, changing the operator from 'ops' to // SomeSDnode so that we can parse this. - std::vector> Ops; + std::vector> Ops; for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op) Ops.push_back( std::pair(DefaultInfo->getArg(op), DefaultInfo->getArgName(op))); - DagInit *DI = DagInit::get(SomeSDNode, nullptr, Ops); + const DagInit *DI = DagInit::get(SomeSDNode, nullptr, Ops); // Create a TreePattern to parse this. TreePattern P(DefaultOps[i], DI, false, *this); @@ -3694,8 +3695,8 @@ static bool InferFromPattern(CodeGenInstruction &InstInfo, /// hasNullFragReference - Return true if the DAG has any reference to the /// null_frag operator. -static bool hasNullFragReference(DagInit *DI) { - DefInit *OpDef = dyn_cast(DI->getOperator()); +static bool hasNullFragReference(const DagInit *DI) { + const DefInit *OpDef = dyn_cast(DI->getOperator()); if (!OpDef) return false; const Record *Operator = OpDef->getDef(); @@ -3708,7 +3709,7 @@ static bool hasNullFragReference(DagInit *DI) { if (auto Arg = dyn_cast(DI->getArg(i))) if (Arg->getDef()->getName() == "null_frag") return true; - DagInit *Arg = dyn_cast(DI->getArg(i)); + const DagInit *Arg = dyn_cast(DI->getArg(i)); if (Arg && hasNullFragReference(Arg)) return true; } @@ -3718,9 +3719,9 @@ static bool hasNullFragReference(DagInit *DI) { /// hasNullFragReference - Return true if any DAG in the list references /// the null_frag operator. 
-static bool hasNullFragReference(ListInit *LI) { - for (Init *I : LI->getValues()) { - DagInit *DI = dyn_cast(I); +static bool hasNullFragReference(const ListInit *LI) { + for (const Init *I : LI->getValues()) { + const DagInit *DI = dyn_cast(I); assert(DI && "non-dag in an instruction Pattern list?!"); if (hasNullFragReference(DI)) return true; @@ -3948,7 +3949,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI, /// resolved instructions. void CodeGenDAGPatterns::ParseInstructions() { for (const Record *Instr : Records.getAllDerivedDefinitions("Instruction")) { - ListInit *LI = nullptr; + const ListInit *LI = nullptr; if (isa(Instr->getValueInit("Pattern"))) LI = Instr->getValueAsListInit("Pattern"); @@ -4310,7 +4311,7 @@ void CodeGenDAGPatterns::ParseOnePattern( TreePattern Temp(Result.getRecord(), DstShared, false, *this); Temp.InferAllTypes(); - ListInit *Preds = TheDef->getValueAsListInit("Predicates"); + const ListInit *Preds = TheDef->getValueAsListInit("Predicates"); int Complexity = TheDef->getValueAsInt("AddedComplexity"); if (PatternRewriter) @@ -4345,7 +4346,7 @@ void CodeGenDAGPatterns::ParseOnePattern( void CodeGenDAGPatterns::ParsePatterns() { for (const Record *CurPattern : Records.getAllDerivedDefinitions("Pattern")) { - DagInit *Tree = CurPattern->getValueAsDag("PatternToMatch"); + const DagInit *Tree = CurPattern->getValueAsDag("PatternToMatch"); // If the pattern references the null_frag, there's nothing to do. if (hasNullFragReference(Tree)) @@ -4353,7 +4354,7 @@ void CodeGenDAGPatterns::ParsePatterns() { TreePattern Pattern(CurPattern, Tree, true, *this); - ListInit *LI = CurPattern->getValueAsListInit("ResultInstrs"); + const ListInit *LI = CurPattern->getValueAsListInit("ResultInstrs"); if (LI->empty()) continue; // no pattern. diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h index 1da7deae0a84..f85753ff5ac8 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h @@ -910,7 +910,7 @@ public: /// current record. TreePattern(const Record *TheRec, const ListInit *RawPat, bool isInput, CodeGenDAGPatterns &ise); - TreePattern(const Record *TheRec, DagInit *Pat, bool isInput, + TreePattern(const Record *TheRec, const DagInit *Pat, bool isInput, CodeGenDAGPatterns &ise); TreePattern(const Record *TheRec, TreePatternNodePtr Pat, bool isInput, CodeGenDAGPatterns &ise); @@ -975,7 +975,7 @@ public: void dump() const; private: - TreePatternNodePtr ParseTreePattern(Init *DI, StringRef OpName); + TreePatternNodePtr ParseTreePattern(const Init *DI, StringRef OpName); void ComputeNamedNodes(); void ComputeNamedNodes(TreePatternNode &N); }; @@ -1055,7 +1055,7 @@ public: /// processed to produce isel. class PatternToMatch { const Record *SrcRecord; // Originating Record for the pattern. - ListInit *Predicates; // Top level predicate conditions to match. + const ListInit *Predicates; // Top level predicate conditions to match. TreePatternNodePtr SrcPattern; // Source pattern to match. TreePatternNodePtr DstPattern; // Resulting pattern. std::vector Dstregs; // Physical register defs being matched. @@ -1065,7 +1065,7 @@ class PatternToMatch { unsigned ID; // Unique ID for the record. 
public: - PatternToMatch(const Record *srcrecord, ListInit *preds, + PatternToMatch(const Record *srcrecord, const ListInit *preds, TreePatternNodePtr src, TreePatternNodePtr dst, ArrayRef dstregs, int complexity, unsigned uid, bool ignore, const Twine &hwmodefeatures = "") @@ -1074,7 +1074,7 @@ public: AddedComplexity(complexity), GISelShouldIgnore(ignore), ID(uid) {} const Record *getSrcRecord() const { return SrcRecord; } - ListInit *getPredicates() const { return Predicates; } + const ListInit *getPredicates() const { return Predicates; } TreePatternNode &getSrcPattern() const { return *SrcPattern; } TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; } TreePatternNode &getDstPattern() const { return *DstPattern; } diff --git a/llvm/utils/TableGen/Common/CodeGenInstAlias.cpp b/llvm/utils/TableGen/Common/CodeGenInstAlias.cpp index 69e00295bf5b..293ed76e0f50 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstAlias.cpp +++ b/llvm/utils/TableGen/Common/CodeGenInstAlias.cpp @@ -67,7 +67,7 @@ bool CodeGenInstAlias::tryAliasOpMatch(const DagInit *Result, // Handle explicit registers. if (ADI && ADI->getDef()->isSubClassOf("Register")) { if (InstOpRec->isSubClassOf("OptionalDefOperand")) { - DagInit *DI = InstOpRec->getValueAsDag("MIOperandInfo"); + const DagInit *DI = InstOpRec->getValueAsDag("MIOperandInfo"); // The operand info should only have a single (register) entry. We // want the register class of it. InstOpRec = cast(DI->getArg(0))->getDef(); @@ -172,7 +172,7 @@ CodeGenInstAlias::CodeGenInstAlias(const Record *R, const CodeGenTarget &T) AsmString = std::string(R->getValueAsString("AsmString")); // Verify that the root of the result is an instruction. - DefInit *DI = dyn_cast(Result->getOperator()); + const DefInit *DI = dyn_cast(Result->getOperator()); if (!DI || !DI->getDef()->isSubClassOf("Instruction")) PrintFatalError(R->getLoc(), "result of inst alias should be an instruction"); diff --git a/llvm/utils/TableGen/Common/CodeGenInstAlias.h b/llvm/utils/TableGen/Common/CodeGenInstAlias.h index 00680b0f2da7..f045b9f6c199 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstAlias.h +++ b/llvm/utils/TableGen/Common/CodeGenInstAlias.h @@ -39,7 +39,7 @@ public: std::string AsmString; /// Result - The result instruction. - DagInit *Result; + const DagInit *Result; /// ResultInst - The instruction generated by the alias (decoded from /// Result). 
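Storing the alias's `Result` as a `const DagInit *` is safe because emitters only ever traverse the result dag, never mutate it. A hedged sketch of such read-only traversal (the helper name `dumpAliasResult` is invented for this example and is not part of the patch):

#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"
using namespace llvm;

// Walk an InstAlias result dag without mutating it.
static void dumpAliasResult(const DagInit *Result) {
  // The dag operator names the instruction the alias expands to.
  if (const DefInit *Op = dyn_cast<DefInit>(Result->getOperator()))
    errs() << "expands to: " << Op->getDef()->getName() << "\n";
  for (unsigned I = 0, E = Result->getNumArgs(); I != E; ++I)
    errs() << "  arg " << I << ": " << Result->getArg(I)->getAsString()
           << "\n";
}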
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp index 7fedc17701c4..1c0ab594d931 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp +++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp @@ -27,9 +27,9 @@ CGIOperandList::CGIOperandList(const Record *R) : TheDef(R) { hasOptionalDef = false; isVariadic = false; - DagInit *OutDI = R->getValueAsDag("OutOperandList"); + const DagInit *OutDI = R->getValueAsDag("OutOperandList"); - if (DefInit *Init = dyn_cast(OutDI->getOperator())) { + if (const DefInit *Init = dyn_cast(OutDI->getOperator())) { if (Init->getDef()->getName() != "outs") PrintFatalError(R->getLoc(), R->getName() + @@ -40,8 +40,8 @@ CGIOperandList::CGIOperandList(const Record *R) : TheDef(R) { NumDefs = OutDI->getNumArgs(); - DagInit *InDI = R->getValueAsDag("InOperandList"); - if (DefInit *Init = dyn_cast(InDI->getOperator())) { + const DagInit *InDI = R->getValueAsDag("InOperandList"); + if (const DefInit *Init = dyn_cast(InDI->getOperator())) { if (Init->getDef()->getName() != "ins") PrintFatalError(R->getLoc(), R->getName() + @@ -56,7 +56,7 @@ CGIOperandList::CGIOperandList(const Record *R) : TheDef(R) { OperandList.reserve(e); bool VariadicOuts = false; for (unsigned i = 0; i != e; ++i) { - Init *ArgInit; + const Init *ArgInit; StringRef ArgName; if (i < NumDefs) { ArgInit = OutDI->getArg(i); @@ -66,11 +66,11 @@ CGIOperandList::CGIOperandList(const Record *R) : TheDef(R) { ArgName = InDI->getArgNameStr(i - NumDefs); } - DagInit *SubArgDag = dyn_cast(ArgInit); + const DagInit *SubArgDag = dyn_cast(ArgInit); if (SubArgDag) ArgInit = SubArgDag->getOperator(); - DefInit *Arg = dyn_cast(ArgInit); + const DefInit *Arg = dyn_cast(ArgInit); if (!Arg) PrintFatalError(R->getLoc(), "Illegal operand for the '" + R->getName() + "' instruction!"); @@ -81,7 +81,7 @@ CGIOperandList::CGIOperandList(const Record *R) : TheDef(R) { std::string OperandType = "OPERAND_UNKNOWN"; std::string OperandNamespace = "MCOI"; unsigned NumOps = 1; - DagInit *MIOpInfo = nullptr; + const DagInit *MIOpInfo = nullptr; if (Rec->isSubClassOf("RegisterOperand")) { PrintMethod = std::string(Rec->getValueAsString("PrintMethod")); OperandType = std::string(Rec->getValueAsString("OperandType")); @@ -280,7 +280,7 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { } // Find the suboperand number involved. - DagInit *MIOpInfo = OperandList[OpIdx].MIOperandInfo; + const DagInit *MIOpInfo = OperandList[OpIdx].MIOperandInfo; if (!MIOpInfo) PrintFatalError(TheDef->getLoc(), TheDef->getName() + ": unknown suboperand name in '" + @@ -581,11 +581,11 @@ std::string CodeGenInstruction::FlattenAsmStringVariants(StringRef Cur, bool CodeGenInstruction::isOperandImpl(StringRef OpListName, unsigned i, StringRef PropertyName) const { - DagInit *ConstraintList = TheDef->getValueAsDag(OpListName); + const DagInit *ConstraintList = TheDef->getValueAsDag(OpListName); if (!ConstraintList || i >= ConstraintList->getNumArgs()) return false; - DefInit *Constraint = dyn_cast(ConstraintList->getArg(i)); + const DefInit *Constraint = dyn_cast(ConstraintList->getArg(i)); if (!Constraint) return false; diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h index 18294b157fed..a799d023b1af 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.h +++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h @@ -110,7 +110,7 @@ public: /// MIOperandInfo - Default MI operand type. 
Note an operand may be made /// up of multiple MI operands. - DagInit *MIOperandInfo; + const DagInit *MIOperandInfo; /// Constraint info for this operand. This operand can have pieces, so we /// track constraint info for each. @@ -118,7 +118,7 @@ public: OperandInfo(const Record *R, const std::string &N, const std::string &PMN, const std::string &OT, unsigned MION, unsigned MINO, - DagInit *MIOI) + const DagInit *MIOI) : Rec(R), Name(N), SubOpNames(MINO), PrinterMethodName(PMN), EncoderMethodNames(MINO), OperandType(OT), MIOperandNo(MION), MINumOperands(MINO), DoNotEncode(MINO), MIOperandInfo(MIOI), diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index b53492dafb25..9e1ebf32c464 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -630,7 +630,7 @@ struct TupleExpander : SetTheory::Expander { std::vector Indices = Def->getValueAsListOfDefs("SubRegIndices"); unsigned Dim = Indices.size(); - ListInit *SubRegs = Def->getValueAsListInit("SubRegs"); + const ListInit *SubRegs = Def->getValueAsListInit("SubRegs"); if (Dim != SubRegs->size()) PrintFatalError(Def->getLoc(), "SubRegIndices and SubRegs size mismatch"); if (Dim < 2) @@ -669,11 +669,11 @@ struct TupleExpander : SetTheory::Expander { } // Take the cost list of the first register in the tuple. - ListInit *CostList = Proto->getValueAsListInit("CostPerUse"); - SmallVector CostPerUse; + const ListInit *CostList = Proto->getValueAsListInit("CostPerUse"); + SmallVector CostPerUse; CostPerUse.insert(CostPerUse.end(), CostList->begin(), CostList->end()); - StringInit *AsmName = StringInit::get(RK, ""); + const StringInit *AsmName = StringInit::get(RK, ""); if (!RegNames.empty()) { if (RegNames.size() <= n) PrintFatalError(Def->getLoc(), @@ -776,7 +776,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, // Allocation order 0 is the full set. AltOrders provides others. const SetTheory::RecVec *Elements = RegBank.getSets().expand(R); - ListInit *AltOrders = R->getValueAsListInit("AltOrders"); + const ListInit *AltOrders = R->getValueAsListInit("AltOrders"); Orders.resize(1 + AltOrders->size()); // Default allocation order always contains all registers. 
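The tuple expander above can hold `const StringInit *` values because TableGen interns Init objects: `StringInit::get` returns a uniqued, immutable object owned by the `RecordKeeper`. A small sketch of that property, assuming a standalone `RecordKeeper` and an invented helper name:

#include "llvm/TableGen/Record.h"
#include <cassert>
using namespace llvm;

void stringInitsAreUniqued() {
  RecordKeeper RK;
  const StringInit *A = StringInit::get(RK, "r0");
  const StringInit *B = StringInit::get(RK, "r0");
  // Identical strings map to the same immutable Init, so pointer
  // comparison suffices and const pointers can be shared freely.
  assert(A == B);
}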
@@ -808,7 +808,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Namespace = R->getValueAsString("Namespace"); if (const RecordVal *RV = R->getValue("RegInfos")) - if (DefInit *DI = dyn_cast_or_null(RV->getValue())) + if (const DefInit *DI = dyn_cast_or_null(RV->getValue())) RSI = RegSizeInfoByHwMode(DI->getDef(), RegBank.getHwModes()); unsigned Size = R->getValueAsInt("Size"); assert((RSI.hasDefault() || Size != 0 || VTs[0].isSimple()) && @@ -831,9 +831,9 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, GlobalPriority = R->getValueAsBit("GlobalPriority"); - BitsInit *TSF = R->getValueAsBitsInit("TSFlags"); + const BitsInit *TSF = R->getValueAsBitsInit("TSFlags"); for (unsigned I = 0, E = TSF->getNumBits(); I != E; ++I) { - BitInit *Bit = cast(TSF->getBit(I)); + const BitInit *Bit = cast(TSF->getBit(I)); TSFlags |= uint8_t(Bit->getValue()) << I; } } diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 9c37fbe9c4b4..06d82daebac0 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -86,8 +86,8 @@ struct InstRegexOp : public SetTheory::Operator { auto Pseudos = Instructions.slice(NumGeneric, NumPseudos); auto NonPseudos = Instructions.slice(NumGeneric + NumPseudos); - for (Init *Arg : Expr->getArgs()) { - StringInit *SI = dyn_cast(Arg); + for (const Init *Arg : Expr->getArgs()) { + const StringInit *SI = dyn_cast(Arg); if (!SI) PrintFatalError(Loc, "instregex requires pattern string: " + Expr->getAsString()); @@ -1828,13 +1828,14 @@ void CodeGenSchedModels::collectRegisterFiles() { ConstRecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses"); std::vector RegisterCosts = RF->getValueAsListOfInts("RegCosts"); - ListInit *MoveElimInfo = RF->getValueAsListInit("AllowMoveElimination"); + const ListInit *MoveElimInfo = + RF->getValueAsListInit("AllowMoveElimination"); for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) { int Cost = RegisterCosts.size() > I ? RegisterCosts[I] : 1; bool AllowMoveElim = false; if (MoveElimInfo->size() > I) { - BitInit *Val = cast(MoveElimInfo->getElement(I)); + const BitInit *Val = cast(MoveElimInfo->getElement(I)); AllowMoveElim = Val->getValue(); } diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp index 9883cf5cf35f..b358518c4290 100644 --- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp @@ -361,16 +361,16 @@ void CodeGenTarget::reverseBitsForLittleEndianEncoding() { R->getValueAsBit("isPseudo")) continue; - BitsInit *BI = R->getValueAsBitsInit("Inst"); + const BitsInit *BI = R->getValueAsBitsInit("Inst"); unsigned numBits = BI->getNumBits(); - SmallVector NewBits(numBits); + SmallVector NewBits(numBits); for (unsigned bit = 0, end = numBits / 2; bit != end; ++bit) { unsigned bitSwapIdx = numBits - bit - 1; - Init *OrigBit = BI->getBit(bit); - Init *BitSwap = BI->getBit(bitSwapIdx); + const Init *OrigBit = BI->getBit(bit); + const Init *BitSwap = BI->getBit(bitSwapIdx); NewBits[bit] = BitSwap; NewBits[bitSwapIdx] = OrigBit; } @@ -380,7 +380,7 @@ void CodeGenTarget::reverseBitsForLittleEndianEncoding() { } RecordKeeper &MutableRC = const_cast(Records); - BitsInit *NewBI = BitsInit::get(MutableRC, NewBits); + const BitsInit *NewBI = BitsInit::get(MutableRC, NewBits); // Update the bits in reversed order so that emitters will get the correct // endianness. 
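`reverseBitsForLittleEndianEncoding` above mirrors each bit of the `Inst` field around the middle of the word. The same transform on a plain integer, as a self-contained illustration (the 8-bit width, the helper name, and the sample value are chosen only for the example):

#include <cstdint>

// Mirror bit i into bit (N-1-i), which is what the emitter does pairwise
// on the BitsInit.
static uint8_t reverseBits8(uint8_t V) {
  uint8_t R = 0;
  for (unsigned Bit = 0; Bit != 8; ++Bit)
    if (V & (1u << Bit))
      R |= uint8_t(1u << (7 - Bit));
  return R;
}
// reverseBits8(0xB0 /*0b10110000*/) == 0x0D /*0b00001101*/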
diff --git a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp index 9dcc5f43a2b5..364b80c36bac 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp @@ -57,7 +57,7 @@ bool PatternParser::parsePatternList( // The match section consists of a list of matchers and predicates. Parse each // one and add the equivalent GIMatchDag nodes, predicates, and edges. for (unsigned I = 0; I < List.getNumArgs(); ++I) { - Init *Arg = List.getArg(I); + const Init *Arg = List.getArg(I); std::string Name = List.getArgName(I) ? List.getArgName(I)->getValue().str() : ("__" + AnonPatNamePrefix + "_" + Twine(I)).str(); @@ -138,7 +138,7 @@ PatternParser::parseInstructionPattern(const Init &Arg, StringRef Name) { return nullptr; for (unsigned K = 0; K < DagPat->getNumArgs(); ++K) { - Init *Arg = DagPat->getArg(K); + const Init *Arg = DagPat->getArg(K); if (auto *DagArg = getDagWithSpecificOperator(*Arg, "MIFlags")) { if (!parseInstructionPatternMIFlags(*Pat, DagArg)) return nullptr; diff --git a/llvm/utils/TableGen/Common/GlobalISel/Patterns.cpp b/llvm/utils/TableGen/Common/GlobalISel/Patterns.cpp index 52f7b0fcbd62..0b84a9bbe634 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/Patterns.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/Patterns.cpp @@ -382,7 +382,7 @@ bool CodeGenInstructionPattern::hasVariadicDefs() const { if (I.variadicOpsAreDefs) return true; - DagInit *OutOps = I.TheDef->getValueAsDag("OutOperandList"); + const DagInit *OutOps = I.TheDef->getValueAsDag("OutOperandList"); if (OutOps->arg_empty()) return false; diff --git a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp index 9b454cf79446..0a835bd7b0bc 100644 --- a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp +++ b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp @@ -100,7 +100,8 @@ public: // Get the name of custom encoder or decoder, if there is any. // Returns `{encoder name, decoder name}`. -static std::pair getCustomCoders(ArrayRef Args) { +static std::pair +getCustomCoders(ArrayRef Args) { std::pair Result; for (const auto *Arg : Args) { const auto *DI = dyn_cast(Arg); @@ -187,8 +188,8 @@ void VarLenInst::buildRec(const DagInit *DI) { PrintFatalError(TheDef->getLoc(), "Expecting at least 3 arguments for `slice`"); HasDynamicSegment = true; - Init *OperandName = DI->getArg(0), *HiBit = DI->getArg(1), - *LoBit = DI->getArg(2); + const Init *OperandName = DI->getArg(0), *HiBit = DI->getArg(1), + *LoBit = DI->getArg(2); if (!isa(OperandName) || !isa(HiBit) || !isa(LoBit)) PrintFatalError(TheDef->getLoc(), "Invalid argument types for `slice`"); @@ -211,7 +212,7 @@ void VarLenInst::buildRec(const DagInit *DI) { if (NeedSwap) { // Normalization: Hi bit should always be the second argument. 
- Init *const NewArgs[] = {OperandName, LoBit, HiBit}; + const Init *const NewArgs[] = {OperandName, LoBit, HiBit}; Segments.push_back({NumBits, DagInit::get(DI->getOperator(), nullptr, NewArgs, {}), CustomEncoder, CustomDecoder}); @@ -241,7 +242,7 @@ void VarLenCodeEmitterGen::run(raw_ostream &OS) { for (const auto [Mode, EncodingDef] : EBM) { Modes.insert({Mode, "_" + HWM.getMode(Mode).Name.str()}); const RecordVal *RV = EncodingDef->getValue("Inst"); - DagInit *DI = cast(RV->getValue()); + const DagInit *DI = cast(RV->getValue()); VarLenInsts[R].insert({Mode, VarLenInst(DI, RV)}); } continue; diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp index 5ee02f4fbf49..e087ff072663 100644 --- a/llvm/utils/TableGen/CompressInstEmitter.cpp +++ b/llvm/utils/TableGen/CompressInstEmitter.cpp @@ -248,7 +248,8 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec, "' in the corresponding instruction operand!"); OperandMap[I].Kind = OpData::Operand; - } else if (IntInit *II = dyn_cast(Dag->getArg(I - TiedCount))) { + } else if (const IntInit *II = + dyn_cast(Dag->getArg(I - TiedCount))) { // Validate that corresponding instruction operand expects an immediate. if (Inst.Operands[I].Rec->isSubClassOf("RegisterClass")) PrintFatalError( @@ -428,7 +429,7 @@ void CompressInstEmitter::createInstOperandMapping( /// Instruction type and generate a warning. void CompressInstEmitter::evaluateCompressPat(const Record *Rec) { // Validate input Dag operands. - DagInit *SourceDag = Rec->getValueAsDag("Input"); + const DagInit *SourceDag = Rec->getValueAsDag("Input"); assert(SourceDag && "Missing 'Input' in compress pattern!"); LLVM_DEBUG(dbgs() << "Input: " << *SourceDag << "\n"); @@ -438,7 +439,7 @@ void CompressInstEmitter::evaluateCompressPat(const Record *Rec) { verifyDagOpCount(SourceInst, SourceDag, true); // Validate output Dag operands. - DagInit *DestDag = Rec->getValueAsDag("Output"); + const DagInit *DestDag = Rec->getValueAsDag("Output"); assert(DestDag && "Missing 'Output' in compress pattern!"); LLVM_DEBUG(dbgs() << "Output: " << *DestDag << "\n"); diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 31c46d5fcbd0..09c1ee4fd0f3 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -844,7 +844,7 @@ void MatcherGen::EmitResultInstructionAsOperand( // children may themselves emit multiple MI operands. 
unsigned NumSubOps = 1; if (OperandNode->isSubClassOf("Operand")) { - DagInit *MIOpInfo = OperandNode->getValueAsDag("MIOperandInfo"); + const DagInit *MIOpInfo = OperandNode->getValueAsDag("MIOperandInfo"); if (unsigned NumArgs = MIOpInfo->getNumArgs()) NumSubOps = NumArgs; } diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp index 7d274a1cf632..264cccf6ac0c 100644 --- a/llvm/utils/TableGen/DFAEmitter.cpp +++ b/llvm/utils/TableGen/DFAEmitter.cpp @@ -306,7 +306,7 @@ StringRef Automaton::getActionSymbolType(StringRef A) { } Transition::Transition(const Record *R, Automaton *Parent) { - BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); + const BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); NewState = 0; assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 && "State cannot be represented in 64 bits!"); diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 06bf7a0c0a83..0598baea9be7 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -160,7 +160,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { const RecordVal *RV = R->getValue("LLVMIntrinsic"); if (RV && RV->getValue()) { - if (DefInit *DI = dyn_cast(RV->getValue())) { + if (const DefInit *DI = dyn_cast(RV->getValue())) { auto *IntrinsicDef = DI->getDef(); auto DefName = IntrinsicDef->getName(); assert(DefName.starts_with("int_") && "invalid intrinsic name"); diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index d4f4e3fa684c..4d2320b31ea9 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -208,7 +208,7 @@ static int Value(bit_value_t V) { } static bit_value_t bitFromBits(const BitsInit &bits, unsigned index) { - if (BitInit *bit = dyn_cast(bits.getBit(index))) + if (const BitInit *bit = dyn_cast(bits.getBit(index))) return bit->getValue() ? BIT_TRUE : BIT_FALSE; // The bit is uninitialized. @@ -234,14 +234,14 @@ static void dumpBits(raw_ostream &OS, const BitsInit &bits) { } } -static BitsInit &getBitsField(const Record &def, StringRef str) { +static const BitsInit &getBitsField(const Record &def, StringRef str) { const RecordVal *RV = def.getValue(str); - if (BitsInit *Bits = dyn_cast(RV->getValue())) + if (const BitsInit *Bits = dyn_cast(RV->getValue())) return *Bits; // variable length instruction VarLenInst VLI = VarLenInst(cast(RV->getValue()), RV); - SmallVector Bits; + SmallVector Bits; for (const auto &SI : VLI) { if (const BitsInit *BI = dyn_cast(SI.Value)) { @@ -459,7 +459,7 @@ protected: // Populates the insn given the uid. 
void insnWithID(insn_t &Insn, unsigned Opcode) const { const Record *EncodingDef = AllInstructions[Opcode].EncodingDef; - BitsInit &Bits = getBitsField(*EncodingDef, "Inst"); + const BitsInit &Bits = getBitsField(*EncodingDef, "Inst"); Insn.resize(std::max(BitWidth, Bits.getNumBits()), BIT_UNSET); // We may have a SoftFail bitmask, which specifies a mask where an encoding // may differ from the value in "Inst" and yet still be valid, but the @@ -1290,7 +1290,7 @@ bool FilterChooser::emitPredicateMatchAux(const Init &Val, bool ParenIfBinOp, } bool FilterChooser::emitPredicateMatch(raw_ostream &OS, unsigned Opc) const { - ListInit *Predicates = + const ListInit *Predicates = AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates"); bool IsFirstEmission = true; for (unsigned i = 0; i < Predicates->size(); ++i) { @@ -1374,11 +1374,11 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, unsigned Opc) const { const Record *EncodingDef = AllInstructions[Opc].EncodingDef; const RecordVal *RV = EncodingDef->getValue("SoftFail"); - BitsInit *SFBits = RV ? dyn_cast(RV->getValue()) : nullptr; + const BitsInit *SFBits = RV ? dyn_cast(RV->getValue()) : nullptr; if (!SFBits) return; - BitsInit *InstBits = EncodingDef->getValueAsBitsInit("Inst"); + const BitsInit *InstBits = EncodingDef->getValueAsBitsInit("Inst"); APInt PositiveMask(BitWidth, 0ULL); APInt NegativeMask(BitWidth, 0ULL); @@ -1886,7 +1886,7 @@ OperandInfo getOpInfo(const Record *TypeRecord) { const RecordVal *HasCompleteDecoderVal = TypeRecord->getValue("hasCompleteDecoder"); - BitInit *HasCompleteDecoderBit = + const BitInit *HasCompleteDecoderBit = HasCompleteDecoderVal ? dyn_cast(HasCompleteDecoderVal->getValue()) : nullptr; @@ -1976,10 +1976,10 @@ static void addOneOperandFields(const Record &EncodingDef, const BitsInit &Bits, OpInfo.InitValue |= 1ULL << I; for (unsigned I = 0, J = 0; I != Bits.getNumBits(); I = J) { - VarInit *Var; + const VarInit *Var; unsigned Offset = 0; for (; J != Bits.getNumBits(); ++J) { - VarBitInit *BJ = dyn_cast(Bits.getBit(J)); + const VarBitInit *BJ = dyn_cast(Bits.getBit(J)); if (BJ) { Var = dyn_cast(BJ->getBitVar()); if (I == J) @@ -2010,7 +2010,7 @@ populateInstruction(const CodeGenTarget &Target, const Record &EncodingDef, // We are bound to fail! For proper disassembly, the well-known encoding bits // of the instruction must be fully specified. - BitsInit &Bits = getBitsField(EncodingDef, "Inst"); + const BitsInit &Bits = getBitsField(EncodingDef, "Inst"); if (Bits.allInComplete()) return 0; @@ -2035,9 +2035,9 @@ populateInstruction(const CodeGenTarget &Target, const Record &EncodingDef, // Gather the outputs/inputs of the instruction, so we can find their // positions in the encoding. This assumes for now that they appear in the // MCInst in the order that they're listed. - std::vector> InOutOperands; - DagInit *Out = Def.getValueAsDag("OutOperandList"); - DagInit *In = Def.getValueAsDag("InOperandList"); + std::vector> InOutOperands; + const DagInit *Out = Def.getValueAsDag("OutOperandList"); + const DagInit *In = Def.getValueAsDag("InOperandList"); for (const auto &[Idx, Arg] : enumerate(Out->getArgs())) InOutOperands.push_back(std::pair(Arg, Out->getArgNameStr(Idx))); for (const auto &[Idx, Arg] : enumerate(In->getArgs())) @@ -2069,7 +2069,7 @@ populateInstruction(const CodeGenTarget &Target, const Record &EncodingDef, } else { // For each operand, see if we can figure out where it is encoded. 
for (const auto &Op : InOutOperands) { - Init *OpInit = Op.first; + const Init *OpInit = Op.first; StringRef OpName = Op.second; // We're ready to find the instruction encoding locations for this @@ -2077,7 +2077,7 @@ populateInstruction(const CodeGenTarget &Target, const Record &EncodingDef, // First, find the operand type ("OpInit"), and sub-op names // ("SubArgDag") if present. - DagInit *SubArgDag = dyn_cast(OpInit); + const DagInit *SubArgDag = dyn_cast(OpInit); if (SubArgDag) OpInit = SubArgDag->getOperator(); const Record *OpTypeRec = cast(OpInit)->getDef(); @@ -2521,7 +2521,7 @@ namespace llvm { for (const auto &NumberedInstruction : NumberedInstructions) { const Record *InstDef = NumberedInstruction->TheDef; if (const RecordVal *RV = InstDef->getValue("EncodingInfos")) { - if (DefInit *DI = dyn_cast_or_null(RV->getValue())) { + if (const DefInit *DI = dyn_cast_or_null(RV->getValue())) { EncodingInfoByHwMode EBM(DI->getDef(), HWM); for (auto &[ModeId, Encoding] : EBM) { // DecoderTables with DefaultMode should not have any suffix. diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index 2524a443f345..424f1ccb067f 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -1375,7 +1375,7 @@ bool CombineRuleBuilder::addFeaturePredicates(RuleMatcher &M) { if (!RuleDef.getValue("Predicates")) return true; - ListInit *Preds = RuleDef.getValueAsListInit("Predicates"); + const ListInit *Preds = RuleDef.getValueAsListInit("Predicates"); for (const Init *PI : Preds->getValues()) { const DefInit *Pred = dyn_cast(PI); if (!Pred) diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 29c64ba95ff8..e866bd983e04 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -579,8 +579,8 @@ Expected GlobalISelEmitter::addBuiltinPredicates( if (const ListInit *AddrSpaces = Predicate.getAddressSpaces()) { SmallVector ParsedAddrSpaces; - for (Init *Val : AddrSpaces->getValues()) { - IntInit *IntVal = dyn_cast(Val); + for (const Init *Val : AddrSpaces->getValues()) { + const IntInit *IntVal = dyn_cast(Val); if (!IntVal) return failedImport("Address space is not an integer"); ParsedAddrSpaces.push_back(IntVal->getValue()); diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index a7039ff7e31e..8c0e27215a73 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -411,7 +411,7 @@ void InstrInfoEmitter::emitOperandTypeMappings( OperandRecords.push_back(Op.Rec); ++CurrentOffset; } else { - for (Init *Arg : MIOI->getArgs()) { + for (const Init *Arg : MIOI->getArgs()) { OperandRecords.push_back(cast(Arg)->getDef()); ++CurrentOffset; } @@ -1296,7 +1296,7 @@ void InstrInfoEmitter::emitRecord( OS << "|(1ULL<getValueAsBitsInit("TSFlags"); + const BitsInit *TSF = Inst.TheDef->getValueAsBitsInit("TSFlags"); if (!TSF) PrintFatalError(Inst.TheDef->getLoc(), "no TSFlags?"); uint64_t Value = 0; diff --git a/llvm/utils/TableGen/OptionParserEmitter.cpp b/llvm/utils/TableGen/OptionParserEmitter.cpp index 424cf16e719d..2872762cc7fd 100644 --- a/llvm/utils/TableGen/OptionParserEmitter.cpp +++ b/llvm/utils/TableGen/OptionParserEmitter.cpp @@ -433,10 +433,10 @@ static void EmitOptionParser(const RecordKeeper &Records, raw_ostream &OS) { OS << ", "; int NumFlags = 0; const ListInit *LI = 
R.getValueAsListInit("Flags"); - for (Init *I : *LI) + for (const Init *I : *LI) OS << (NumFlags++ ? " | " : "") << cast(I)->getDef()->getName(); if (GroupFlags) { - for (Init *I : *GroupFlags) + for (const Init *I : *GroupFlags) OS << (NumFlags++ ? " | " : "") << cast(I)->getDef()->getName(); } @@ -447,11 +447,11 @@ static void EmitOptionParser(const RecordKeeper &Records, raw_ostream &OS) { OS << ", "; int NumVisFlags = 0; LI = R.getValueAsListInit("Visibility"); - for (Init *I : *LI) + for (const Init *I : *LI) OS << (NumVisFlags++ ? " | " : "") << cast(I)->getDef()->getName(); if (GroupVis) { - for (Init *I : *GroupVis) + for (const Init *I : *GroupVis) OS << (NumVisFlags++ ? " | " : "") << cast(I)->getDef()->getName(); } @@ -473,7 +473,7 @@ static void EmitOptionParser(const RecordKeeper &Records, raw_ostream &OS) { HelpTextsForVariants; for (const Record *VisibilityHelp : R.getValueAsListOfDefs("HelpTextsForVariants")) { - ArrayRef Visibilities = + ArrayRef Visibilities = VisibilityHelp->getValueAsListInit("Visibilities")->getValues(); std::vector VisibilityNames; diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 371ee75d1b49..be2a2b3884c7 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -445,7 +445,7 @@ void RegisterInfoEmitter::EmitRegMappingTables( if (!V || !V->getValue()) continue; - DefInit *DI = cast(V->getValue()); + const DefInit *DI = cast(V->getValue()); const Record *Alias = DI->getDef(); const auto &AliasIter = llvm::lower_bound( DwarfRegNums, Alias, [](const DwarfRegNumsMapPair &A, const Record *B) { @@ -1061,10 +1061,10 @@ void RegisterInfoEmitter::runMCDesc(raw_ostream &OS) { OS << " 0,\n"; for (const auto &RE : Regs) { const Record *Reg = RE.TheDef; - BitsInit *BI = Reg->getValueAsBitsInit("HWEncoding"); + const BitsInit *BI = Reg->getValueAsBitsInit("HWEncoding"); uint64_t Value = 0; for (unsigned b = 0, be = BI->getNumBits(); b != be; ++b) { - if (BitInit *B = dyn_cast(BI->getBit(b))) + if (const BitInit *B = dyn_cast(BI->getBit(b))) Value |= (uint64_t)B->getValue() << b; } OS << " " << Value << ",\n"; diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index d6cb94cdff24..4bf4df692acb 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -196,7 +196,7 @@ private: bool IsPrimary, raw_ostream &OS); void emitIfdef(StringRef Guard, raw_ostream &OS); - bool parseFieldType(GenericField &Field, Init *II); + bool parseFieldType(GenericField &Field, const Init *II); std::unique_ptr parseSearchIndex(GenericTable &Table, const RecordVal *RecVal, StringRef Name, ArrayRef Key, bool EarlyOut, bool ReturnRange); @@ -233,8 +233,8 @@ int64_t SearchableTableEmitter::getNumericKey(const SearchIndex &Index, bool SearchableTableEmitter::compareBy(const Record *LHS, const Record *RHS, const SearchIndex &Index) { for (const auto &Field : Index.Fields) { - Init *LHSI = LHS->getValueInit(Field.Name); - Init *RHSI = RHS->getValueInit(Field.Name); + const Init *LHSI = LHS->getValueInit(Field.Name); + const Init *RHSI = RHS->getValueInit(Field.Name); if (isa(Field.RecType) || isa(Field.RecType)) { int64_t LHSi = getAsInt(LHSI); @@ -574,7 +574,8 @@ void SearchableTableEmitter::emitGenericTable(const GenericTable &Table, OS << "#endif\n\n"; } -bool SearchableTableEmitter::parseFieldType(GenericField &Field, Init *TypeOf) { +bool 
SearchableTableEmitter::parseFieldType(GenericField &Field, + const Init *TypeOf) { auto Type = dyn_cast(TypeOf); if (!Type) return false; diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 8ab7bdcd2214..bcc5712b9154 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -251,7 +251,7 @@ static uint8_t byteFromBitsInit(const BitsInit *B) { uint8_t Value = 0; for (unsigned I = 0; I != N; ++I) { - BitInit *Bit = cast(B->getBit(I)); + const BitInit *Bit = cast(B->getBit(I)); Value |= Bit->getValue() << I; } return Value; @@ -487,7 +487,7 @@ void X86FoldTablesEmitter::addEntryWithFlags(FoldTable &Table, uint8_t Enc = byteFromBitsInit(RegRec->getValueAsBitsInit("OpEncBits")); if (isExplicitAlign(RegInst)) { // The instruction require explicitly aligned memory. - BitsInit *VectSize = RegRec->getValueAsBitsInit("VectSize"); + const BitsInit *VectSize = RegRec->getValueAsBitsInit("VectSize"); Result.Alignment = Align(byteFromBitsInit(VectSize)); } else if (!Enc && !isExplicitUnalign(RegInst) && getMemOperandSize(MemOpRec) > 64) { @@ -512,7 +512,7 @@ void X86FoldTablesEmitter::addBroadcastEntry( assert(Table.find(RegInst) == Table.end() && "Override entry unexpectedly"); X86FoldTableEntry Result = X86FoldTableEntry(RegInst, MemInst); - DagInit *In = MemInst->TheDef->getValueAsDag("InOperandList"); + const DagInit *In = MemInst->TheDef->getValueAsDag("InOperandList"); for (unsigned I = 0, E = In->getNumArgs(); I != E; ++I) { Result.BroadcastKind = StringSwitch(In->getArg(I)->getAsString()) diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp index 47df5bf0df8e..10fab469a080 100644 --- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp +++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp @@ -112,7 +112,7 @@ static uint8_t byteFromBitsInit(const BitsInit *B) { uint8_t Value = 0; for (unsigned I = 0; I != N; ++I) { - BitInit *Bit = cast(B->getBit(I)); + const BitInit *Bit = cast(B->getBit(I)); Value |= Bit->getValue() << I; } return Value; diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 60fc1d1ecbfa..26b881651ea4 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -77,17 +77,15 @@ unsigned X86Disassembler::getMemOperandSize(const Record *MemRec) { /// @param init - A reference to the BitsInit to be decoded. /// @return - The field, with the first bit in the BitsInit as the lowest /// order bit. -static uint8_t byteFromBitsInit(BitsInit &init) { +static uint8_t byteFromBitsInit(const BitsInit &init) { int width = init.getNumBits(); assert(width <= 8 && "Field is too large for uint8_t!"); - int index; uint8_t mask = 0x01; - uint8_t ret = 0; - for (index = 0; index < width; index++) { + for (int index = 0; index < width; index++) { if (cast(init.getBit(index))->getValue()) ret |= mask; @@ -104,7 +102,7 @@ static uint8_t byteFromBitsInit(BitsInit &init) { /// @param name - The name of the field in the record. /// @return - The field, as translated by byteFromBitsInit(). 
static uint8_t byteFromRec(const Record *rec, StringRef name) { - BitsInit *bits = rec->getValueAsBitsInit(name); + const BitsInit *bits = rec->getValueAsBitsInit(name); return byteFromBitsInit(*bits); } diff --git a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp index 6a3d5a25e28c..d7967c7a7753 100644 --- a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp +++ b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp @@ -258,8 +258,7 @@ void Generator::emitParseHelper(StringRef kind, StringRef returnType, SmallVector argNames; if (def->isSubClassOf("CompositeBytecode")) { const DagInit *members = def->getValueAsDag("members"); - args = llvm::to_vector(map_range( - members->getArgs(), [](Init *init) { return (const Init *)init; })); + args = llvm::to_vector(members->getArgs()); argNames = llvm::to_vector( map_range(members->getArgNames(), [](const StringInit *init) { return init->getAsUnquotedString(); -- GitLab From 922992a22f7c87c192cf96606038df3cf20d6404 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 18 Oct 2024 15:58:33 +0100 Subject: [PATCH 068/511] Fix typo "instrinsic" (#112899) --- clang/utils/TableGen/RISCVVEmitter.cpp | 4 ++-- flang/docs/OptionComparison.md | 2 +- flang/include/flang/Runtime/magic-numbers.h | 2 +- flang/lib/Evaluate/intrinsics.cpp | 2 +- flang/lib/Optimizer/Builder/Runtime/Numeric.cpp | 6 +++--- flang/lib/Optimizer/Builder/Runtime/Reduction.cpp | 2 +- lldb/CMakeLists.txt | 2 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +- llvm/include/llvm/Transforms/Utils/SSAUpdater.h | 2 +- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 2 +- llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 2 +- llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll | 2 +- llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll | 2 +- llvm/test/Transforms/JumpThreading/thread-debug-info.ll | 2 +- llvm/test/Transforms/SROA/fake-use-sroa.ll | 2 +- llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp | 2 +- mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp | 2 +- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 2 +- 19 files changed, 22 insertions(+), 22 deletions(-) diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index 50f161fd38ce..aecca0f5df8d 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -169,7 +169,7 @@ static VectorTypeModifier getTupleVTM(unsigned NF) { static unsigned getIndexedLoadStorePtrIdx(const RVVIntrinsic *RVVI) { // We need a special rule for segment load/store since the data width is not - // encoded in the instrinsic name itself. + // encoded in the intrinsic name itself. const StringRef IRName = RVVI->getIRName(); constexpr unsigned RVV_VTA = 0x1; constexpr unsigned RVV_VMA = 0x2; @@ -192,7 +192,7 @@ static unsigned getIndexedLoadStorePtrIdx(const RVVIntrinsic *RVVI) { static unsigned getSegInstLog2SEW(StringRef InstName) { // clang-format off // We need a special rule for indexed segment load/store since the data width - // is not encoded in the instrinsic name itself. + // is not encoded in the intrinsic name itself. if (InstName.starts_with("vloxseg") || InstName.starts_with("vluxseg") || InstName.starts_with("vsoxseg") || InstName.starts_with("vsuxseg")) return (unsigned)-1; diff --git a/flang/docs/OptionComparison.md b/flang/docs/OptionComparison.md index 9d6916ef62af..fb65498fa1f4 100644 --- a/flang/docs/OptionComparison.md +++ b/flang/docs/OptionComparison.md @@ -53,7 +53,7 @@ eN fdec,

-fall-instrinsics
+fall-intrinsics
qxlf77,

diff --git a/flang/include/flang/Runtime/magic-numbers.h b/flang/include/flang/Runtime/magic-numbers.h index bab0e9ae0529..1d3c5dca0b4b 100644 --- a/flang/include/flang/Runtime/magic-numbers.h +++ b/flang/include/flang/Runtime/magic-numbers.h @@ -107,7 +107,7 @@ The denorm value is a nonstandard extension. #if 0 ieee_round_type values -The values are those of the llvm.get.rounding instrinsic, which is assumed by +The values are those of the llvm.get.rounding intrinsic, which is assumed by ieee_arithmetic module rounding procedures. #endif #define _FORTRAN_RUNTIME_IEEE_TO_ZERO 0 diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 4271faa0db12..aa4496781772 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -1690,7 +1690,7 @@ std::optional IntrinsicInterface::Match( // MAX and MIN (and others that map to them) allow their last argument to // be repeated indefinitely. The actualForDummy vector is sized // and null-initialized to the non-repeated dummy argument count - // for other instrinsics. + // for other intrinsics. bool isMaxMin{dummyArgPatterns > 0 && dummy[dummyArgPatterns - 1].optionality == Optionality::repeats}; std::vector actualForDummy( diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp index c13064a284d1..d0092add0118 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp @@ -284,7 +284,7 @@ struct ForcedSpacing16 { } }; -/// Generate call to Exponent instrinsic runtime routine. +/// Generate call to Exponent intrinsic runtime routine. mlir::Value fir::runtime::genExponent(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type resultType, mlir::Value x) { @@ -320,7 +320,7 @@ mlir::Value fir::runtime::genExponent(fir::FirOpBuilder &builder, return builder.create(loc, func, args).getResult(0); } -/// Generate call to Fraction instrinsic runtime routine. +/// Generate call to Fraction intrinsic runtime routine. mlir::Value fir::runtime::genFraction(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value x) { mlir::func::FuncOp func; @@ -596,7 +596,7 @@ mlir::Value fir::runtime::genSelectedRealKind(fir::FirOpBuilder &builder, return builder.create(loc, func, args).getResult(0); } -/// Generate call to Set_exponent instrinsic runtime routine. +/// Generate call to Set_exponent intrinsic runtime routine. mlir::Value fir::runtime::genSetExponent(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value x, mlir::Value i) { diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp index f6627dff671e..b768733bd2fd 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp @@ -1513,7 +1513,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc, // The IAll, IAny and IParity intrinsics have essentially the same // implementation. This macro will generate the function body given the -// instrinsic name. +// intrinsic name. 
#define GEN_IALL_IANY_IPARITY(F) \ mlir::Value fir::runtime::JOIN2(gen, F)( \ fir::FirOpBuilder & builder, mlir::Location loc, mlir::Value arrayBox, \ diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 59cdc4593463..5827e04b5662 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -97,7 +97,7 @@ if (LLDB_ENABLE_PYTHON OR LLDB_ENABLE_LUA) add_subdirectory(bindings) endif () -# We need the headers generated by instrinsics_gen before we can compile +# We need the headers generated by intrinsics_gen before we can compile # any source file in LLDB as the imported Clang modules might include # some of these generated headers. This approach is copied from Clang's main # CMakeLists.txt, so it should kept in sync the code in Clang which was added diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 715f2cc917e2..92226a687cad 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1115,7 +1115,7 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < // it is const 0. A struct intrinsic with constant 0 index is different to the // corresponding raw intrinsic on gfx9+ because the behavior of bound checking // and swizzling changes depending on whether idxen is set in the instruction. -// These instrinsics also keep the offset and soffset arguments separate as +// These intrinsics also keep the offset and soffset arguments separate as // they behave differently in bounds checking and swizzling. // The versions of these intrinsics that take <4 x i32> arguments are deprecated diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h index 29d96a0ab6bf..73649766a953 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -118,7 +118,7 @@ public: /// Rewrite debug value intrinsics to conform to a new SSA form. /// - /// This will scout out all the debug value instrinsics associated with + /// This will scout out all the debug value intrinsics associated with /// the instruction. Anything outside of its block will have its /// value set to the new SSA value if available, and undef if not. void UpdateDebugValues(Instruction *I); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index ee5e75955cd4..e4ca1ae0499b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -580,7 +580,7 @@ std::pair AMDGPUAtomicOptimizerImpl::buildScanIteratively( auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits"); ActiveBits->addIncoming(Ballot, EntryBB); - // Use llvm.cttz instrinsic to find the lowest remaining active lane. + // Use llvm.cttz intrinsic to find the lowest remaining active lane. auto *FF1 = B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()}); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 68182d238e78..6b308bc8c9aa 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -50,7 +50,7 @@ /// each of the preceding fields which are relevant for a given instruction /// in the opcode space. 
/// -/// Currently, the policy is represented via the following instrinsic families: +/// Currently, the policy is represented via the following intrinsic families: /// * _MASK - Can represent all three policy states for both tail and mask. If /// passthrough is IMPLICIT_DEF (or NoReg), then represents "undefined". /// Otherwise, policy operand and tablegen flags drive the interpretation. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 60b3294b5f0b..2c0543842a82 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -708,7 +708,7 @@ defm "" : ReplaceLane; defm "" : ReplaceLane; defm "" : ReplaceLane; -// For now use an instrinsic for f16x8.replace_lane instead of ReplaceLane above +// For now use an intrinsic for f16x8.replace_lane instead of ReplaceLane above // since LLVM IR generated with half type arguments is not well supported and // creates conversions from f16->f32. defm REPLACE_LANE_F16x8 : diff --git a/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll b/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll index a2d171c17308..a97ed0a9851e 100644 --- a/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll +++ b/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll @@ -54,7 +54,7 @@ define @ld2.nxv32i8_no_eltty( %Pg, i8 *%bas ret %res } -; ldN instrinsic name with only output type +; ldN intrinsic name with only output type define @ld2.nxv32i8_no_predty_pty( %Pg, i8 *%base_ptr) { ; CHECK-LABEL: @ld2.nxv32i8_no_predty_pty ; CHECK: %1 = call { , } @llvm.aarch64.sve.ld2.sret.nxv16i8( %Pg, ptr %base_ptr) diff --git a/llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll b/llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll index 56b151d7f941..2762b8d3455c 100644 --- a/llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-reduce-add-01.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; Test vector add reduction instrinsic +; Test vector add reduction intrinsic ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s diff --git a/llvm/test/Transforms/JumpThreading/thread-debug-info.ll b/llvm/test/Transforms/JumpThreading/thread-debug-info.ll index cd7b0b1c05a8..4727413b35a6 100644 --- a/llvm/test/Transforms/JumpThreading/thread-debug-info.ll +++ b/llvm/test/Transforms/JumpThreading/thread-debug-info.ll @@ -50,7 +50,7 @@ exit: ; preds = %bb.f4, %bb.f3, %bb. ret void, !dbg !29 } -; This is testing for debug value instrinsics outside of the threaded block pointing to a value +; This is testing for debug value intrinsics outside of the threaded block pointing to a value ; inside to correctly take any new definitions. define void @test2(i32 %cond1, i32 %cond2) !dbg !5 { ; CHECK: bb.f3 diff --git a/llvm/test/Transforms/SROA/fake-use-sroa.ll b/llvm/test/Transforms/SROA/fake-use-sroa.ll index 9e92df154875..42b0cbb3b0df 100644 --- a/llvm/test/Transforms/SROA/fake-use-sroa.ll +++ b/llvm/test/Transforms/SROA/fake-use-sroa.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -passes=sroa %s | FileCheck %s -; With fake use instrinsics generated for small aggregates, check that when +; With fake use intrinsics generated for small aggregates, check that when ; SROA slices the aggregate, we generate individual fake use intrinsics for ; the individual values. 
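The JumpThreading test above exercises exactly the behavior documented for `SSAUpdater::UpdateDebugValues` earlier in this patch: debug value records outside a duplicated block must be retargeted to the new SSA value, or to undef where none is available. A hedged sketch of how a transform calls it (the helper name is invented; creation of the updater is assumed, not shown):

#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

// After duplicating the block containing I, rewrite any debug value
// intrinsics that refer to I from outside its block so they pick up the
// new SSA value where one is available, and undef otherwise.
static void fixDebugUsers(Instruction *I, SSAUpdater &Updater) {
  Updater.UpdateDebugValues(I);
}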
diff --git a/llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp b/llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp index 3ce85f5d7be2..8fe74e34fe44 100644 --- a/llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp +++ b/llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp @@ -487,7 +487,7 @@ TEST(RandomIRBuilderTest, findSourceAndSink) { ASSERT_TRUE(DT.dominates(Insts[IP - 1], Sink)); } } -TEST(RandomIRBuilderTest, sinkToInstrinsic) { +TEST(RandomIRBuilderTest, sinkToIntrinsic) { const char *Source = "\n\ declare double @llvm.sqrt.f64(double %Val) \n\ declare void @llvm.ubsantrap(i8 immarg) cold noreturn nounwind \n\ diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index a150e2be1173..984af50a7b0a 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -678,7 +678,7 @@ lowerReductionWithStartValue(ConversionPatternRewriter &rewriter, Location loc, vectorOperand, fmf); } -/// Overloaded methods to lower a *predicated* reduction to an llvm instrinsic +/// Overloaded methods to lower a *predicated* reduction to an llvm intrinsic /// that requires a start value. This start value format spans across fp /// reductions without mask and all the masked reduction intrinsics. template diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 6e97b2a50af8..1f63519373ec 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -2180,7 +2180,7 @@ ModuleImport::processDebugIntrinsic(llvm::DbgVariableIntrinsic *dbgIntr, return emitError(loc) << "failed to convert a debug intrinsic operand: " << diag(*dbgIntr); - // Ensure that the debug instrinsic is inserted right after its operand is + // Ensure that the debug intrinsic is inserted right after its operand is // defined. Otherwise, the operand might not necessarily dominate the // intrinsic. If the defining operation is a terminator, insert the intrinsic // into a dominated block. -- GitLab From 00d30bd61e5f9a3c5658005ff0f74fcfef7b0c7e Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 18 Oct 2024 10:09:10 -0500 Subject: [PATCH 069/511] [libc] Commit document formatting change someone left --- libc/docs/configure.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index e225e6b566df..3db750b1aed2 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -33,7 +33,7 @@ to learn about the defaults for your platform and target. * **"general" options** - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** - - ``LIBC_CONF_FREXP_INF_NAN_EXPONENT``: Set the specific exp value for Inf/NaN inputs. + - ``LIBC_CONF_FREXP_INF_NAN_EXPONENT``: The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified. Configue an explicit exp value for Inf/NaN inputs. - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST. * **"printf" options** - ``LIBC_CONF_PRINTF_DISABLE_FIXED_POINT``: Disable printing fixed point values in printf and friends. 
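As context for the ``LIBC_CONF_FREXP_INF_NAN_EXPONENT`` entry above, a minimal sketch of the frexp behavior the option pins down; this assumes only standard <cmath>, and the function and variable names below are illustrative, not taken from llvm-libc:

#include <cmath>
#include <cstdio>

int main() {
  int exp = 0;

  // Finite inputs are fully specified by the C standard: 6.0 == 0.75 * 2^3.
  double frac = std::frexp(6.0, &exp);
  std::printf("frexp(6.0) -> %g * 2^%d\n", frac, exp);

  // For +/-Inf and NaN, the value stored through the second parameter is
  // unspecified by C; LIBC_CONF_FREXP_INF_NAN_EXPONENT lets an llvm-libc
  // build write back one fixed, predictable exponent for these inputs.
  frac = std::frexp(INFINITY, &exp);
  std::printf("frexp(inf) -> %g, exp = %d\n", frac, exp);
  return 0;
}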
-- GitLab From 54566ba52304beede0d80851c0202c2dcf7a03ec Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 18 Oct 2024 08:14:14 -0700 Subject: [PATCH 070/511] [SandboxIR] Implement Operator (#112805) This patch implements sandboxir::Operator mirroring llvm::Operator. --- llvm/include/llvm/SandboxIR/Operator.h | 60 ++++++++++++++++ llvm/include/llvm/SandboxIR/Value.h | 4 ++ llvm/unittests/SandboxIR/CMakeLists.txt | 1 + llvm/unittests/SandboxIR/OperatorTest.cpp | 88 +++++++++++++++++++++++ 4 files changed, 153 insertions(+) create mode 100644 llvm/include/llvm/SandboxIR/Operator.h create mode 100644 llvm/unittests/SandboxIR/OperatorTest.cpp diff --git a/llvm/include/llvm/SandboxIR/Operator.h b/llvm/include/llvm/SandboxIR/Operator.h new file mode 100644 index 000000000000..95c450807191 --- /dev/null +++ b/llvm/include/llvm/SandboxIR/Operator.h @@ -0,0 +1,60 @@ +//===- Operator.h -----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SANDBOXIR_OPERATOR_H +#define LLVM_SANDBOXIR_OPERATOR_H + +#include "llvm/IR/Operator.h" +#include "llvm/SandboxIR/Instruction.h" +#include "llvm/SandboxIR/User.h" + +namespace llvm::sandboxir { + +class Operator : public User { +public: + // The Operator class is intended to be used as a utility, and is never itself + // instantiated. + Operator() = delete; + void *operator new(size_t s) = delete; + + static bool classof(const Instruction *) { return true; } + static bool classof(const ConstantExpr *) { return true; } + static bool classof(const Value *From) { + return llvm::Operator::classof(From->Val); + } + bool hasPoisonGeneratingFlags() const { + return cast(Val)->hasPoisonGeneratingFlags(); + } +}; + +class OverflowingBinaryOperator : public Operator { +public: + bool hasNoUnsignedWrap() const { + return cast(Val)->hasNoUnsignedWrap(); + } + bool hasNoSignedWrap() const { + return cast(Val)->hasNoSignedWrap(); + } + unsigned getNoWrapKind() const { + return cast(Val)->getNoWrapKind(); + } + static bool classof(const Instruction *From) { + return llvm::OverflowingBinaryOperator::classof( + cast(From->Val)); + } + static bool classof(const ConstantExpr *From) { + return llvm::OverflowingBinaryOperator::classof( + cast(From->Val)); + } + static bool classof(const Value *From) { + return llvm::OverflowingBinaryOperator::classof(From->Val); + } +}; +} // namespace llvm::sandboxir + +#endif // LLVM_SANDBOXIR_OPERATOR_H diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index 3509f2a8d836..58088684bf18 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -28,6 +28,8 @@ class Module; class UnaryInstruction; class CmpInst; class IntrinsicInst; +class Operator; +class OverflowingBinaryOperator; /// Iterator for the `Use` edges of a Value's users. /// \Returns a `Use` when dereferenced. @@ -158,6 +160,8 @@ protected: friend class Utils; // For `Val`. friend class Module; // For `Val`. friend class IntrinsicInst; // For `Val`. + friend class Operator; // For `Val`. + friend class OverflowingBinaryOperator; // For `Val`. // Region needs to manipulate metadata in the underlying LLVM Value, we don't // expose metadata in sandboxir. 
friend class Region; diff --git a/llvm/unittests/SandboxIR/CMakeLists.txt b/llvm/unittests/SandboxIR/CMakeLists.txt index 1e83bda7a1f6..b20ef829ed0c 100644 --- a/llvm/unittests/SandboxIR/CMakeLists.txt +++ b/llvm/unittests/SandboxIR/CMakeLists.txt @@ -9,6 +9,7 @@ add_llvm_unittest(SandboxIRTests IntrinsicInstTest.cpp PassTest.cpp RegionTest.cpp + OperatorTest.cpp SandboxIRTest.cpp TrackerTest.cpp TypesTest.cpp diff --git a/llvm/unittests/SandboxIR/OperatorTest.cpp b/llvm/unittests/SandboxIR/OperatorTest.cpp new file mode 100644 index 000000000000..031e2adf4069 --- /dev/null +++ b/llvm/unittests/SandboxIR/OperatorTest.cpp @@ -0,0 +1,88 @@ +//===- OperatorTest.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/SandboxIR/Operator.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Function.h" +#include "llvm/SandboxIR/Instruction.h" +#include "llvm/SandboxIR/Module.h" +#include "llvm/SandboxIR/Value.h" +#include "llvm/Support/SourceMgr.h" +#include "gtest/gtest.h" + +using namespace llvm; + +struct OperatorTest : public testing::Test { + LLVMContext C; + std::unique_ptr M; + + void parseIR(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + M = parseAssemblyString(IR, Err, C); + if (!M) + Err.print("OperatorTest", errs()); + } + BasicBlock *getBasicBlockByName(Function &F, StringRef Name) { + for (BasicBlock &BB : F) + if (BB.getName() == Name) + return &BB; + llvm_unreachable("Expected to find basic block!"); + } +}; + +TEST_F(OperatorTest, Operator) { + parseIR(C, R"IR( +define void @foo(i8 %v1) { + %add0 = add i8 %v1, 42 + %add1 = add nuw i8 %v1, 42 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *OperatorI0 = cast(&*It++); + auto *OperatorI1 = cast(&*It++); + EXPECT_FALSE(OperatorI0->hasPoisonGeneratingFlags()); + EXPECT_TRUE(OperatorI1->hasPoisonGeneratingFlags()); +} + +TEST_F(OperatorTest, OverflowingBinaryOperator) { + parseIR(C, R"IR( +define void @foo(i8 %v1) { + %add = add i8 %v1, 42 + %addNSW = add nsw i8 %v1, 42 + %addNUW = add nuw i8 %v1, 42 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Add = cast(&*It++); + auto *AddNSW = cast(&*It++); + auto *AddNUW = cast(&*It++); + EXPECT_FALSE(Add->hasNoUnsignedWrap()); + EXPECT_FALSE(Add->hasNoSignedWrap()); + EXPECT_EQ(Add->getNoWrapKind(), llvm::OverflowingBinaryOperator::AnyWrap); + + EXPECT_FALSE(AddNSW->hasNoUnsignedWrap()); + EXPECT_TRUE(AddNSW->hasNoSignedWrap()); + EXPECT_EQ(AddNSW->getNoWrapKind(), + llvm::OverflowingBinaryOperator::NoSignedWrap); + + EXPECT_TRUE(AddNUW->hasNoUnsignedWrap()); + EXPECT_FALSE(AddNUW->hasNoSignedWrap()); + EXPECT_EQ(AddNUW->getNoWrapKind(), + llvm::OverflowingBinaryOperator::NoUnsignedWrap); +} -- GitLab From dbe47c2a06e0928edde802d062ecf1a0ce45fbb9 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 18 Oct 2024 15:14:44 +0000 Subject: [PATCH 071/511] [gn build] Port 54566ba52304 --- 
llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn index 578b2b18d6ee..5f2bf7ed3646 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn @@ -9,6 +9,7 @@ unittest("SandboxIRTests") { ] sources = [ "IntrinsicInstTest.cpp", + "OperatorTest.cpp", "PassTest.cpp", "RegionTest.cpp", "SandboxIRTest.cpp", -- GitLab From 397707f7188b6df52de1cff85e08e64e3ee5acc3 Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Fri, 18 Oct 2024 23:27:12 +0800 Subject: [PATCH 072/511] [libc++] __uglify non-conforming member typedef `base` (#112843) Currently, libc++'s `bitset`, `forward_list`, and `list` have non-conforming member typedef name `base`. The typedef is private, but can cause ambiguity in name lookup. Some other classes in libc++ that are either implementation details or not precisely specified by the standard also have member typedef `base`. I think this can still be conforming. Follows up #80706 and #111127. --- libcxx/docs/ReleaseNotes/20.rst | 6 +- libcxx/include/bitset | 56 ++--- libcxx/include/forward_list | 109 +++++----- libcxx/include/list | 191 +++++++++--------- .../sequences/forwardlist/types.pass.cpp | 18 ++ .../containers/sequences/list/types.pass.cpp | 18 ++ .../nonstdmem.uglified.compile.pass.cpp | 15 +- 7 files changed, 235 insertions(+), 178 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index abd6764579e5..44912d2ddafa 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -78,9 +78,9 @@ Deprecations and Removals supported as an extension anymore, please migrate any code that uses e.g. ``std::vector`` to be standards conforming. -- Non-conforming member typedefs ``iterator`` and ``const_iterator`` of ``std::bitset`` are removed. Previously, they - were private but could cause ambiguity in name lookup. Code that expects such ambiguity will possibly not compile in - LLVM 20. +- Non-conforming member typedefs ``base``, ``iterator`` and ``const_iterator`` of ``std::bitset``, and member typedef + ``base`` of ``std::forward_list`` and ``std::list`` are removed. Previously, they were private but could cause + ambiguity in name lookup. Code that expects such ambiguity will possibly not compile in LLVM 20. - The function ``__libcpp_verbose_abort()`` is now ``noexcept``, to match ``std::terminate()``. (The combination of ``noexcept`` and ``[[noreturn]]`` has special significance for function effects analysis.) diff --git a/libcxx/include/bitset b/libcxx/include/bitset index f90ceaab816c..645c172f3be4 100644 --- a/libcxx/include/bitset +++ b/libcxx/include/bitset @@ -612,15 +612,15 @@ class _LIBCPP_TEMPLATE_VIS bitset : private __bitset<_Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1, _Size> { public: static const unsigned __n_words = _Size == 0 ?
0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1; - typedef __bitset<__n_words, _Size> base; + typedef __bitset<__n_words, _Size> __base; public: - typedef typename base::reference reference; - typedef typename base::const_reference const_reference; + typedef typename __base::reference reference; + typedef typename __base::const_reference const_reference; // 23.3.5.1 constructors: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bitset() _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bitset(unsigned long long __v) _NOEXCEPT : base(__v) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bitset(unsigned long long __v) _NOEXCEPT : __base(__v) {} template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 explicit bitset( const _CharT* __str, @@ -681,11 +681,15 @@ public: // element access: #ifdef _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator[](size_t __p) const { return base::__make_ref(__p); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator[](size_t __p) const { return __base::__make_ref(__p); } #else - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference operator[](size_t __p) const { return base::__make_ref(__p); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference operator[](size_t __p) const { + return __base::__make_ref(__p); + } #endif - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 reference operator[](size_t __p) { return base::__make_ref(__p); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 reference operator[](size_t __p) { + return __base::__make_ref(__p); + } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong() const; template @@ -726,10 +730,10 @@ private: _CharT __c = __str[__mp - 1 - __i]; (*this)[__i] = _Traits::eq(__c, __one); } - std::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + std::fill(__base::__make_iter(__i), __base::__make_iter(_Size), false); } - _LIBCPP_HIDE_FROM_ABI size_t __hash_code() const _NOEXCEPT { return base::__hash_code(); } + _LIBCPP_HIDE_FROM_ABI size_t __hash_code() const _NOEXCEPT { return __base::__hash_code(); } friend struct hash; }; @@ -737,43 +741,43 @@ private: template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::operator&=(const bitset& __rhs) _NOEXCEPT { - base::operator&=(__rhs); + __base::operator&=(__rhs); return *this; } template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::operator|=(const bitset& __rhs) _NOEXCEPT { - base::operator|=(__rhs); + __base::operator|=(__rhs); return *this; } template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::operator^=(const bitset& __rhs) _NOEXCEPT { - base::operator^=(__rhs); + __base::operator^=(__rhs); return *this; } template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::operator<<=(size_t __pos) _NOEXCEPT { __pos = std::min(__pos, _Size); - std::copy_backward(base::__make_iter(0), base::__make_iter(_Size - __pos), base::__make_iter(_Size)); - std::fill_n(base::__make_iter(0), __pos, false); + std::copy_backward(__base::__make_iter(0), __base::__make_iter(_Size - __pos), __base::__make_iter(_Size)); + std::fill_n(__base::__make_iter(0), __pos, false); return *this; } template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::operator>>=(size_t __pos) _NOEXCEPT { 
__pos = std::min(__pos, _Size); - std::copy(base::__make_iter(__pos), base::__make_iter(_Size), base::__make_iter(0)); - std::fill_n(base::__make_iter(_Size - __pos), __pos, false); + std::copy(__base::__make_iter(__pos), __base::__make_iter(_Size), __base::__make_iter(0)); + std::fill_n(__base::__make_iter(_Size - __pos), __pos, false); return *this; } template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::set() _NOEXCEPT { - std::fill_n(base::__make_iter(0), _Size, true); + std::fill_n(__base::__make_iter(0), _Size, true); return *this; } @@ -788,7 +792,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size> template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::reset() _NOEXCEPT { - std::fill_n(base::__make_iter(0), _Size, false); + std::fill_n(__base::__make_iter(0), _Size, false); return *this; } @@ -810,7 +814,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size> bitset< template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::flip() _NOEXCEPT { - base::flip(); + __base::flip(); return *this; } @@ -819,19 +823,19 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size> if (__pos >= _Size) __throw_out_of_range("bitset flip argument out of range"); - reference __r = base::__make_ref(__pos); + reference __r = __base::__make_ref(__pos); __r = ~__r; return *this; } template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long bitset<_Size>::to_ulong() const { - return base::to_ulong(); + return __base::to_ulong(); } template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long bitset<_Size>::to_ullong() const { - return base::to_ullong(); + return __base::to_ullong(); } template @@ -868,13 +872,13 @@ bitset<_Size>::to_string(char __zero, char __one) const { template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 size_t bitset<_Size>::count() const _NOEXCEPT { - return static_cast(std::count(base::__make_iter(0), base::__make_iter(_Size), true)); + return static_cast(std::count(__base::__make_iter(0), __base::__make_iter(_Size), true)); } template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool bitset<_Size>::operator==(const bitset& __rhs) const _NOEXCEPT { - return std::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); + return std::equal(__base::__make_iter(0), __base::__make_iter(_Size), __rhs.__make_iter(0)); } #if _LIBCPP_STD_VER <= 17 @@ -896,12 +900,12 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool bitset<_Size>::test(siz template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool bitset<_Size>::all() const _NOEXCEPT { - return base::all(); + return __base::all(); } template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool bitset<_Size>::any() const _NOEXCEPT { - return base::any(); + return __base::any(); } template diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index d3262fb8eaed..04466d9a673f 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -640,12 +640,12 @@ void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT { template */> class _LIBCPP_TEMPLATE_VIS forward_list : private __forward_list_base<_Tp, _Alloc> { - typedef __forward_list_base<_Tp, _Alloc> base; - typedef typename base::__node_allocator __node_allocator; - typedef typename base::__node_type __node_type; - typedef 
typename base::__node_traits __node_traits; - typedef typename base::__node_pointer __node_pointer; - typedef typename base::__begin_node_pointer __begin_node_pointer; + typedef __forward_list_base<_Tp, _Alloc> __base; + typedef typename __base::__node_allocator __node_allocator; + typedef typename __base::__node_type __node_type; + typedef typename __base::__node_traits __node_traits; + typedef typename __base::__node_pointer __node_pointer; + typedef typename __base::__begin_node_pointer __begin_node_pointer; public: typedef _Tp value_type; @@ -666,8 +666,8 @@ public: typedef typename allocator_traits::size_type size_type; typedef typename allocator_traits::difference_type difference_type; - typedef typename base::iterator iterator; - typedef typename base::const_iterator const_iterator; + typedef typename __base::iterator iterator; + typedef typename __base::const_iterator const_iterator; #if _LIBCPP_STD_VER >= 20 typedef size_type __remove_return_type; #else @@ -684,7 +684,7 @@ public: _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v); template <__enable_if_t<__is_allocator<_Alloc>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v, const allocator_type& __a) : base(__a) { + _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v, const allocator_type& __a) : __base(__a) { insert_after(cbefore_begin(), __n, __v); } @@ -697,7 +697,7 @@ public: #if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> _LIBCPP_HIDE_FROM_ABI forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type()) - : base(__a) { + : __base(__a) { prepend_range(std::forward<_Range>(__range)); } #endif @@ -708,8 +708,8 @@ public: _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x); #ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible::value) - : base(std::move(__x)) {} + _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value) + : __base(std::move(__x)) {} _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x, const __type_identity_t& __a); _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list __il); @@ -738,35 +738,37 @@ public: _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(base::__alloc()); } + _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(__base::__alloc()); } - _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(base::__before_begin()->__next_); } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__base::__before_begin()->__next_); } _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { - return const_iterator(base::__before_begin()->__next_); + return const_iterator(__base::__before_begin()->__next_); } _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(nullptr); } _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { - return const_iterator(base::__before_begin()->__next_); + return const_iterator(__base::__before_begin()->__next_); } _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return const_iterator(nullptr); } - _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { return iterator(base::__before_begin()); 
} - _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { return const_iterator(base::__before_begin()); } + _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { return iterator(__base::__before_begin()); } + _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { + return const_iterator(__base::__before_begin()); + } _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { - return const_iterator(base::__before_begin()); + return const_iterator(__base::__before_begin()); } [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { - return base::__before_begin()->__next_ == nullptr; + return __base::__before_begin()->__next_ == nullptr; } _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { - return std::min(__node_traits::max_size(base::__alloc()), numeric_limits::max()); + return std::min(__node_traits::max_size(__base::__alloc()), numeric_limits::max()); } - _LIBCPP_HIDE_FROM_ABI reference front() { return base::__before_begin()->__next_->__get_value(); } - _LIBCPP_HIDE_FROM_ABI const_reference front() const { return base::__before_begin()->__next_->__get_value(); } + _LIBCPP_HIDE_FROM_ABI reference front() { return __base::__before_begin()->__next_->__get_value(); } + _LIBCPP_HIDE_FROM_ABI const_reference front() const { return __base::__before_begin()->__next_->__get_value(); } #ifndef _LIBCPP_CXX03_LANG # if _LIBCPP_STD_VER >= 17 @@ -823,12 +825,12 @@ public: _NOEXCEPT_(!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>) #endif { - base::swap(__x); + __base::swap(__x); } _LIBCPP_HIDE_FROM_ABI void resize(size_type __n); _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { base::clear(); } + _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); } _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x); _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x, const_iterator __i); @@ -899,12 +901,12 @@ forward_list(from_range_t, _Range&&, _Alloc = _Alloc()) -> forward_list -inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : base(__a) {} +inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {} template forward_list<_Tp, _Alloc>::forward_list(size_type __n) { if (__n > 0) { - for (__begin_node_pointer __p = base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) { + for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) { __p->__next_ = this->__create_node(/* next = */ nullptr); } } @@ -912,9 +914,9 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n) { #if _LIBCPP_STD_VER >= 14 template -forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc) : base(__base_alloc) { +forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc) : __base(__base_alloc) { if (__n > 0) { - for (__begin_node_pointer __p = base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) { + for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) { __p->__next_ = this->__create_node(/* next = */ nullptr); } } @@ -934,26 +936,27 @@ forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) template template ::value, int> > -forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l, const 
allocator_type& __a) : base(__a) { +forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a) + : __base(__a) { insert_after(cbefore_begin(), __f, __l); } template forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x) - : base(__node_traits::select_on_container_copy_construction(__x.__alloc())) { + : __base(__node_traits::select_on_container_copy_construction(__x.__alloc())) { insert_after(cbefore_begin(), __x.begin(), __x.end()); } template forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x, const __type_identity_t& __a) - : base(__a) { + : __base(__a) { insert_after(cbefore_begin(), __x.begin(), __x.end()); } template forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) { if (this != std::addressof(__x)) { - base::__copy_assign_alloc(__x); + __base::__copy_assign_alloc(__x); assign(__x.begin(), __x.end()); } return *this; @@ -962,8 +965,8 @@ forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_li #ifndef _LIBCPP_CXX03_LANG template forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identity_t& __a) - : base(std::move(__x), __a) { - if (base::__alloc() != __x.__alloc()) { + : __base(std::move(__x), __a) { + if (__base::__alloc() != __x.__alloc()) { typedef move_iterator _Ip; insert_after(cbefore_begin(), _Ip(__x.begin()), _Ip(__x.end())); } @@ -975,7 +978,7 @@ forward_list<_Tp, _Alloc>::forward_list(initializer_list __il) { } template -forward_list<_Tp, _Alloc>::forward_list(initializer_list __il, const allocator_type& __a) : base(__a) { +forward_list<_Tp, _Alloc>::forward_list(initializer_list __il, const allocator_type& __a) : __base(__a) { insert_after(cbefore_begin(), __il.begin(), __il.end()); } @@ -983,14 +986,14 @@ template void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type) _NOEXCEPT_(is_nothrow_move_assignable::value) { clear(); - base::__move_assign_alloc(__x); - base::__before_begin()->__next_ = __x.__before_begin()->__next_; - __x.__before_begin()->__next_ = nullptr; + __base::__move_assign_alloc(__x); + __base::__before_begin()->__next_ = __x.__before_begin()->__next_; + __x.__before_begin()->__next_ = nullptr; } template void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) { - if (base::__alloc() == __x.__alloc()) + if (__base::__alloc() == __x.__alloc()) __move_assign(__x, true_type()); else { typedef move_iterator _Ip; @@ -1061,29 +1064,30 @@ typename forward_list<_Tp, _Alloc>::reference void # endif forward_list<_Tp, _Alloc>::emplace_front(_Args&&... 
__args) { - base::__before_begin()->__next_ = - this->__create_node(/* next = */ base::__before_begin()->__next_, std::forward<_Args>(__args)...); + __base::__before_begin()->__next_ = + this->__create_node(/* next = */ __base::__before_begin()->__next_, std::forward<_Args>(__args)...); # if _LIBCPP_STD_VER >= 17 - return base::__before_begin()->__next_->__get_value(); + return __base::__before_begin()->__next_->__get_value(); # endif } template void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) { - base::__before_begin()->__next_ = this->__create_node(/* next = */ base::__before_begin()->__next_, std::move(__v)); + __base::__before_begin()->__next_ = + this->__create_node(/* next = */ __base::__before_begin()->__next_, std::move(__v)); } #endif // _LIBCPP_CXX03_LANG template void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) { - base::__before_begin()->__next_ = this->__create_node(/* next = */ base::__before_begin()->__next_, __v); + __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, __v); } template void forward_list<_Tp, _Alloc>::pop_front() { - __node_pointer __p = base::__before_begin()->__next_; - base::__before_begin()->__next_ = __p->__next_; + __node_pointer __p = __base::__before_begin()->__next_; + __base::__before_begin()->__next_ = __p->__next_; this->__delete_node(__p); } @@ -1380,8 +1384,9 @@ template template void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) { if (this != std::addressof(__x)) { - base::__before_begin()->__next_ = __merge(base::__before_begin()->__next_, __x.__before_begin()->__next_, __comp); - __x.__before_begin()->__next_ = nullptr; + __base::__before_begin()->__next_ = + __merge(__base::__before_begin()->__next_, __x.__before_begin()->__next_, __comp); + __x.__before_begin()->__next_ = nullptr; } } @@ -1425,7 +1430,7 @@ forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Co template template inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) { - base::__before_begin()->__next_ = __sort(base::__before_begin()->__next_, std::distance(begin(), end()), __comp); + __base::__before_begin()->__next_ = __sort(__base::__before_begin()->__next_, std::distance(begin(), end()), __comp); } template @@ -1455,7 +1460,7 @@ forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Co template void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT { - __node_pointer __p = base::__before_begin()->__next_; + __node_pointer __p = __base::__before_begin()->__next_; if (__p != nullptr) { __node_pointer __f = __p->__next_; __p->__next_ = nullptr; @@ -1465,7 +1470,7 @@ void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT { __p = __f; __f = __t; } - base::__before_begin()->__next_ = __p; + __base::__before_begin()->__next_ = __p; } } diff --git a/libcxx/include/list b/libcxx/include/list index 4a169b08d8cd..953027542712 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -665,14 +665,14 @@ void __list_imp<_Tp, _Alloc>::swap(__list_imp& __c) template */> class _LIBCPP_TEMPLATE_VIS list : private __list_imp<_Tp, _Alloc> { - typedef __list_imp<_Tp, _Alloc> base; - typedef typename base::__node_type __node_type; - typedef typename base::__node_allocator __node_allocator; - typedef typename base::__node_pointer __node_pointer; - typedef typename base::__node_alloc_traits __node_alloc_traits; - typedef typename base::__node_base __node_base; - typedef typename base::__node_base_pointer __node_base_pointer; - typedef 
typename base::__base_pointer __base_pointer; + typedef __list_imp<_Tp, _Alloc> __base; + typedef typename __base::__node_type __node_type; + typedef typename __base::__node_allocator __node_allocator; + typedef typename __base::__node_pointer __node_pointer; + typedef typename __base::__node_alloc_traits __node_alloc_traits; + typedef typename __base::__node_base __node_base; + typedef typename __base::__node_base_pointer __node_base_pointer; + typedef typename __base::__base_pointer __base_pointer; public: typedef _Tp value_type; @@ -682,12 +682,12 @@ public: "Allocator::value_type must be same type as value_type"); typedef value_type& reference; typedef const value_type& const_reference; - typedef typename base::pointer pointer; - typedef typename base::const_pointer const_pointer; - typedef typename base::size_type size_type; - typedef typename base::difference_type difference_type; - typedef typename base::iterator iterator; - typedef typename base::const_iterator const_iterator; + typedef typename __base::pointer pointer; + typedef typename __base::const_pointer const_pointer; + typedef typename __base::size_type size_type; + typedef typename __base::difference_type difference_type; + typedef typename __base::iterator iterator; + typedef typename __base::const_iterator const_iterator; typedef std::reverse_iterator reverse_iterator; typedef std::reverse_iterator const_reverse_iterator; #if _LIBCPP_STD_VER >= 20 @@ -697,14 +697,14 @@ public: #endif _LIBCPP_HIDE_FROM_ABI list() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {} - _LIBCPP_HIDE_FROM_ABI explicit list(const allocator_type& __a) : base(__a) {} + _LIBCPP_HIDE_FROM_ABI explicit list(const allocator_type& __a) : __base(__a) {} _LIBCPP_HIDE_FROM_ABI explicit list(size_type __n); #if _LIBCPP_STD_VER >= 14 _LIBCPP_HIDE_FROM_ABI explicit list(size_type __n, const allocator_type& __a); #endif _LIBCPP_HIDE_FROM_ABI list(size_type __n, const value_type& __x); template <__enable_if_t<__is_allocator<_Alloc>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI list(size_type __n, const value_type& __x, const allocator_type& __a) : base(__a) { + _LIBCPP_HIDE_FROM_ABI list(size_type __n, const value_type& __x, const allocator_type& __a) : __base(__a) { for (; __n > 0; --__n) push_back(__x); } @@ -717,7 +717,8 @@ public: #if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type()) : base(__a) { + _LIBCPP_HIDE_FROM_ABI list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type()) + : __base(__a) { prepend_range(std::forward<_Range>(__range)); } #endif @@ -757,18 +758,18 @@ public: _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return base::__sz(); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return base::empty(); } + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __base::__sz(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __base::empty(); } _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { - return std::min(base::__node_alloc_max_size(), numeric_limits::max()); + return std::min(__base::__node_alloc_max_size(), numeric_limits::max()); } - _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return base::begin(); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return base::begin(); } - 
_LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return base::end(); } - _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return base::end(); } - _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return base::begin(); } - _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return base::end(); } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __base::begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __base::begin(); } + _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __base::end(); } + _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __base::end(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return __base::begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return __base::end(); } _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } @@ -779,19 +780,19 @@ public: _LIBCPP_HIDE_FROM_ABI reference front() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list"); - return base::__end_.__next_->__as_node()->__get_value(); + return __base::__end_.__next_->__as_node()->__get_value(); } _LIBCPP_HIDE_FROM_ABI const_reference front() const { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list"); - return base::__end_.__next_->__as_node()->__get_value(); + return __base::__end_.__next_->__as_node()->__get_value(); } _LIBCPP_HIDE_FROM_ABI reference back() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list"); - return base::__end_.__prev_->__as_node()->__get_value(); + return __base::__end_.__prev_->__as_node()->__get_value(); } _LIBCPP_HIDE_FROM_ABI const_reference back() const { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list"); - return base::__end_.__prev_->__as_node()->__get_value(); + return __base::__end_.__prev_->__as_node()->__get_value(); } #ifndef _LIBCPP_CXX03_LANG @@ -864,9 +865,9 @@ public: _NOEXCEPT_(!__node_alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>) #endif { - base::swap(__c); + __base::swap(__c); } - _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { base::clear(); } + _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); } _LIBCPP_HIDE_FROM_ABI void pop_front(); _LIBCPP_HIDE_FROM_ABI void pop_back(); @@ -967,24 +968,24 @@ inline void list<_Tp, _Alloc>::__link_nodes(__base_pointer __p, __base_pointer _ // Link in nodes [__f, __l] at the front of the list template inline void list<_Tp, _Alloc>::__link_nodes_at_front(__base_pointer __f, __base_pointer __l) { - __f->__prev_ = base::__end_as_link(); - __l->__next_ = base::__end_.__next_; - __l->__next_->__prev_ = __l; - base::__end_.__next_ = __f; + __f->__prev_ = __base::__end_as_link(); + __l->__next_ = __base::__end_.__next_; + __l->__next_->__prev_ = __l; + __base::__end_.__next_ = __f; } // Link in nodes [__f, __l] at the back of the list template inline void list<_Tp, _Alloc>::__link_nodes_at_back(__base_pointer __f, __base_pointer __l) { - __l->__next_ = base::__end_as_link(); - __f->__prev_ = base::__end_.__prev_; - __f->__prev_->__next_ = __f; - base::__end_.__prev_ = __l; + __l->__next_ = __base::__end_as_link(); + __f->__prev_ = __base::__end_.__prev_; + __f->__prev_->__next_ = __f; + __base::__end_.__prev_ = __l; } template inline typename list<_Tp, 
_Alloc>::iterator list<_Tp, _Alloc>::__iterator(size_type __n) { - return __n <= base::__sz() / 2 ? std::next(begin(), __n) : std::prev(end(), base::__sz() - __n); + return __n <= __base::__sz() / 2 ? std::next(begin(), __n) : std::prev(end(), __base::__sz() - __n); } template @@ -999,7 +1000,7 @@ list<_Tp, _Alloc>::list(size_type __n) { #if _LIBCPP_STD_VER >= 14 template -list<_Tp, _Alloc>::list(size_type __n, const allocator_type& __a) : base(__a) { +list<_Tp, _Alloc>::list(size_type __n, const allocator_type& __a) : __base(__a) { for (; __n > 0; --__n) emplace_back(); } @@ -1020,20 +1021,20 @@ list<_Tp, _Alloc>::list(_InpIter __f, _InpIter __l) { template template ::value, int> > -list<_Tp, _Alloc>::list(_InpIter __f, _InpIter __l, const allocator_type& __a) : base(__a) { +list<_Tp, _Alloc>::list(_InpIter __f, _InpIter __l, const allocator_type& __a) : __base(__a) { for (; __f != __l; ++__f) __emplace_back(*__f); } template list<_Tp, _Alloc>::list(const list& __c) - : base(__node_alloc_traits::select_on_container_copy_construction(__c.__node_alloc())) { + : __base(__node_alloc_traits::select_on_container_copy_construction(__c.__node_alloc())) { for (const_iterator __i = __c.begin(), __e = __c.end(); __i != __e; ++__i) push_back(*__i); } template -list<_Tp, _Alloc>::list(const list& __c, const __type_identity_t& __a) : base(__a) { +list<_Tp, _Alloc>::list(const list& __c, const __type_identity_t& __a) : __base(__a) { for (const_iterator __i = __c.begin(), __e = __c.end(); __i != __e; ++__i) push_back(*__i); } @@ -1041,7 +1042,7 @@ list<_Tp, _Alloc>::list(const list& __c, const __type_identity_t #ifndef _LIBCPP_CXX03_LANG template -list<_Tp, _Alloc>::list(initializer_list __il, const allocator_type& __a) : base(__a) { +list<_Tp, _Alloc>::list(initializer_list __il, const allocator_type& __a) : __base(__a) { for (typename initializer_list::const_iterator __i = __il.begin(), __e = __il.end(); __i != __e; ++__i) push_back(*__i); } @@ -1054,12 +1055,12 @@ list<_Tp, _Alloc>::list(initializer_list __il) { template inline list<_Tp, _Alloc>::list(list&& __c) noexcept(is_nothrow_move_constructible<__node_allocator>::value) - : base(std::move(__c.__node_alloc())) { + : __base(std::move(__c.__node_alloc())) { splice(end(), __c); } template -inline list<_Tp, _Alloc>::list(list&& __c, const __type_identity_t& __a) : base(__a) { +inline list<_Tp, _Alloc>::list(list&& __c, const __type_identity_t& __a) : __base(__a) { if (__a == __c.get_allocator()) splice(end(), __c); else { @@ -1078,7 +1079,7 @@ inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(list&& __c) noexcept( template void list<_Tp, _Alloc>::__move_assign(list& __c, false_type) { - if (base::__node_alloc() != __c.__node_alloc()) { + if (__base::__node_alloc() != __c.__node_alloc()) { typedef move_iterator _Ip; assign(_Ip(__c.begin()), _Ip(__c.end())); } else @@ -1089,7 +1090,7 @@ template void list<_Tp, _Alloc>::__move_assign(list& __c, true_type) noexcept(is_nothrow_move_assignable<__node_allocator>::value) { clear(); - base::__move_assign_alloc(__c); + __base::__move_assign_alloc(__c); splice(end(), __c); } @@ -1098,7 +1099,7 @@ void list<_Tp, _Alloc>::__move_assign(list& __c, template inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(const list& __c) { if (this != std::addressof(__c)) { - base::__copy_assign_alloc(__c); + __base::__copy_assign_alloc(__c); assign(__c.begin(), __c.end()); } return *this; @@ -1137,14 +1138,14 @@ void list<_Tp, _Alloc>::assign(size_type __n, const value_type& __x) { template inline _Alloc list<_Tp, 
_Alloc>::get_allocator() const _NOEXCEPT { - return allocator_type(base::__node_alloc()); + return allocator_type(__base::__node_alloc()); } template typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::insert(const_iterator __p, const value_type& __x) { __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, __x); __link_nodes(__p.__ptr_, __node->__as_link(), __node->__as_link()); - ++base::__sz(); + ++__base::__sz(); return iterator(__node->__as_link()); } @@ -1178,7 +1179,7 @@ list<_Tp, _Alloc>::insert(const_iterator __p, size_type __n, const value_type& _ } #endif // _LIBCPP_HAS_EXCEPTIONS __link_nodes(__p.__ptr_, __r.__ptr_, __e.__ptr_); - base::__sz() += __ds; + __base::__sz() += __ds; } return __r; } @@ -1220,7 +1221,7 @@ list<_Tp, _Alloc>::__insert_with_sentinel(const_iterator __p, _Iterator __f, _Se } #endif // _LIBCPP_HAS_EXCEPTIONS __link_nodes(__p.__ptr_, __r.__ptr_, __e.__ptr_); - base::__sz() += __ds; + __base::__sz() += __ds; } return __r; } @@ -1230,7 +1231,7 @@ void list<_Tp, _Alloc>::push_front(const value_type& __x) { __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, __x); __base_pointer __nl = __node->__as_link(); __link_nodes_at_front(__nl, __nl); - ++base::__sz(); + ++__base::__sz(); } template @@ -1238,7 +1239,7 @@ void list<_Tp, _Alloc>::push_back(const value_type& __x) { __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, __x); __base_pointer __nl = __node->__as_link(); __link_nodes_at_back(__nl, __nl); - ++base::__sz(); + ++__base::__sz(); } #ifndef _LIBCPP_CXX03_LANG @@ -1248,7 +1249,7 @@ void list<_Tp, _Alloc>::push_front(value_type&& __x) { __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::move(__x)); __base_pointer __nl = __node->__as_link(); __link_nodes_at_front(__nl, __nl); - ++base::__sz(); + ++__base::__sz(); } template @@ -1256,7 +1257,7 @@ void list<_Tp, _Alloc>::push_back(value_type&& __x) { __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::move(__x)); __base_pointer __nl = __node->__as_link(); __link_nodes_at_back(__nl, __nl); - ++base::__sz(); + ++__base::__sz(); } template @@ -1271,7 +1272,7 @@ list<_Tp, _Alloc>::emplace_front(_Args&&... __args) { this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::forward<_Args>(__args)...); __base_pointer __nl = __node->__as_link(); __link_nodes_at_front(__nl, __nl); - ++base::__sz(); + ++__base::__sz(); # if _LIBCPP_STD_VER >= 17 return __node->__get_value(); # endif @@ -1289,7 +1290,7 @@ list<_Tp, _Alloc>::emplace_back(_Args&&... 
__args) { this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::forward<_Args>(__args)...); __base_pointer __nl = __node->__as_link(); __link_nodes_at_back(__nl, __nl); - ++base::__sz(); + ++__base::__sz(); # if _LIBCPP_STD_VER >= 17 return __node->__get_value(); # endif @@ -1302,7 +1303,7 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::emplace(const_iterator _ this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::forward<_Args>(__args)...); __base_pointer __nl = __node->__as_link(); __link_nodes(__p.__ptr_, __nl, __nl); - ++base::__sz(); + ++__base::__sz(); return iterator(__nl); } @@ -1311,7 +1312,7 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::insert(const_iterator __ __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::move(__x)); __base_pointer __nl = __node->__as_link(); __link_nodes(__p.__ptr_, __nl, __nl); - ++base::__sz(); + ++__base::__sz(); return iterator(__nl); } @@ -1320,18 +1321,18 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::insert(const_iterator __ template void list<_Tp, _Alloc>::pop_front() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::pop_front() called with empty list"); - __base_pointer __n = base::__end_.__next_; - base::__unlink_nodes(__n, __n); - --base::__sz(); + __base_pointer __n = __base::__end_.__next_; + __base::__unlink_nodes(__n, __n); + --__base::__sz(); this->__delete_node(__n->__as_node()); } template void list<_Tp, _Alloc>::pop_back() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::pop_back() called on an empty list"); - __base_pointer __n = base::__end_.__prev_; - base::__unlink_nodes(__n, __n); - --base::__sz(); + __base_pointer __n = __base::__end_.__prev_; + __base::__unlink_nodes(__n, __n); + --__base::__sz(); this->__delete_node(__n->__as_node()); } @@ -1340,8 +1341,8 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __p _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__p != end(), "list::erase(iterator) called with a non-dereferenceable iterator"); __base_pointer __n = __p.__ptr_; __base_pointer __r = __n->__next_; - base::__unlink_nodes(__n, __n); - --base::__sz(); + __base::__unlink_nodes(__n, __n); + --__base::__sz(); this->__delete_node(__n->__as_node()); return iterator(__r); } @@ -1349,11 +1350,11 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __p template typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __f, const_iterator __l) { if (__f != __l) { - base::__unlink_nodes(__f.__ptr_, __l.__ptr_->__prev_); + __base::__unlink_nodes(__f.__ptr_, __l.__ptr_->__prev_); while (__f != __l) { __base_pointer __n = __f.__ptr_; ++__f; - --base::__sz(); + --__base::__sz(); this->__delete_node(__n->__as_node()); } } @@ -1362,10 +1363,10 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __f template void list<_Tp, _Alloc>::resize(size_type __n) { - if (__n < base::__sz()) + if (__n < __base::__sz()) erase(__iterator(__n), end()); - else if (__n > base::__sz()) { - __n -= base::__sz(); + else if (__n > __base::__sz()) { + __n -= __base::__sz(); size_type __ds = 0; __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr); ++__ds; @@ -1391,16 +1392,16 @@ void list<_Tp, _Alloc>::resize(size_type __n) { } #endif // _LIBCPP_HAS_EXCEPTIONS __link_nodes_at_back(__r.__ptr_, __e.__ptr_); - base::__sz() += __ds; + __base::__sz() += __ds; } } template void list<_Tp, _Alloc>::resize(size_type __n, const 
value_type& __x) { - if (__n < base::__sz()) + if (__n < __base::__sz()) erase(__iterator(__n), end()); - else if (__n > base::__sz()) { - __n -= base::__sz(); + else if (__n > __base::__sz()) { + __n -= __base::__sz(); size_type __ds = 0; __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, __x); ++__ds; @@ -1426,8 +1427,8 @@ void list<_Tp, _Alloc>::resize(size_type __n, const value_type& __x) { throw; } #endif // _LIBCPP_HAS_EXCEPTIONS - __link_nodes(base::__end_as_link(), __r.__ptr_, __e.__ptr_); - base::__sz() += __ds; + __link_nodes(__base::__end_as_link(), __r.__ptr_, __e.__ptr_); + __base::__sz() += __ds; } } @@ -1438,9 +1439,9 @@ void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c) { if (!__c.empty()) { __base_pointer __f = __c.__end_.__next_; __base_pointer __l = __c.__end_.__prev_; - base::__unlink_nodes(__f, __l); + __base::__unlink_nodes(__f, __l); __link_nodes(__p.__ptr_, __f, __l); - base::__sz() += __c.__sz(); + __base::__sz() += __c.__sz(); __c.__sz() = 0; } } @@ -1449,10 +1450,10 @@ template void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c, const_iterator __i) { if (__p.__ptr_ != __i.__ptr_ && __p.__ptr_ != __i.__ptr_->__next_) { __base_pointer __f = __i.__ptr_; - base::__unlink_nodes(__f, __f); + __base::__unlink_nodes(__f, __f); __link_nodes(__p.__ptr_, __f, __f); --__c.__sz(); - ++base::__sz(); + ++__base::__sz(); } } @@ -1465,9 +1466,9 @@ void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c, const_iterator __f if (this != std::addressof(__c)) { size_type __s = std::distance(__f, __l) + 1; __c.__sz() -= __s; - base::__sz() += __s; + __base::__sz() += __s; } - base::__unlink_nodes(__first, __last); + __base::__unlink_nodes(__first, __last); __link_nodes(__p.__ptr_, __first, __last); } } @@ -1547,12 +1548,12 @@ void list<_Tp, _Alloc>::merge(list& __c, _Comp __comp) { iterator __m2 = std::next(__f2); for (; __m2 != __e2 && __comp(*__m2, *__f1); ++__m2, (void)++__ds) ; - base::__sz() += __ds; + __base::__sz() += __ds; __c.__sz() -= __ds; __base_pointer __f = __f2.__ptr_; __base_pointer __l = __m2.__ptr_->__prev_; __f2 = __m2; - base::__unlink_nodes(__f, __l); + __base::__unlink_nodes(__f, __l); __m2 = std::next(__f1); __link_nodes(__f1.__ptr_, __f, __l); __f1 = __m2; @@ -1571,7 +1572,7 @@ inline void list<_Tp, _Alloc>::sort() { template template inline void list<_Tp, _Alloc>::sort(_Comp __comp) { - __sort(begin(), end(), base::__sz(), __comp); + __sort(begin(), end(), __base::__sz(), __comp); } template @@ -1585,7 +1586,7 @@ list<_Tp, _Alloc>::__sort(iterator __f1, iterator __e2, size_type __n, _Comp& __ case 2: if (__comp(*--__e2, *__f1)) { __base_pointer __f = __e2.__ptr_; - base::__unlink_nodes(__f, __f); + __base::__unlink_nodes(__f, __f); __link_nodes(__f1.__ptr_, __f, __f); return __e2; } @@ -1603,7 +1604,7 @@ list<_Tp, _Alloc>::__sort(iterator __f1, iterator __e2, size_type __n, _Comp& __ __base_pointer __l = __m2.__ptr_->__prev_; __r = __f2; __e1 = __f2 = __m2; - base::__unlink_nodes(__f, __l); + __base::__unlink_nodes(__f, __l); __m2 = std::next(__f1); __link_nodes(__f1.__ptr_, __f, __l); __f1 = __m2; @@ -1619,7 +1620,7 @@ list<_Tp, _Alloc>::__sort(iterator __f1, iterator __e2, size_type __n, _Comp& __ if (__e1 == __f2) __e1 = __m2; __f2 = __m2; - base::__unlink_nodes(__f, __l); + __base::__unlink_nodes(__f, __l); __m2 = std::next(__f1); __link_nodes(__f1.__ptr_, __f, __l); __f1 = __m2; @@ -1631,7 +1632,7 @@ list<_Tp, _Alloc>::__sort(iterator __f1, iterator __e2, size_type __n, _Comp& __ template void 
list<_Tp, _Alloc>::reverse() _NOEXCEPT { - if (base::__sz() > 1) { + if (__base::__sz() > 1) { iterator __e = end(); for (iterator __i = begin(); __i.__ptr_ != __e.__ptr_;) { std::swap(__i.__ptr_->__prev_, __i.__ptr_->__next_); diff --git a/libcxx/test/std/containers/sequences/forwardlist/types.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/types.pass.cpp index 9867bf855e8b..54766013d907 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/types.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/types.pass.cpp @@ -30,6 +30,24 @@ #include "test_macros.h" #include "min_allocator.h" +// Ensures that we don't use a non-uglified name 'base' in the implementation of 'forward_list'. + +struct my_base { + typedef my_base base; +}; + +template > +struct my_derived : my_base, std::forward_list {}; + +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +#if TEST_STD_VER >= 11 +static_assert(std::is_same>::base, my_base>::value, ""); +static_assert(std::is_same>::base, my_base>::value, ""); +static_assert(std::is_same>::base, my_base>::value, ""); +#endif + struct A { std::forward_list v; }; // incomplete type support int main(int, char**) diff --git a/libcxx/test/std/containers/sequences/list/types.pass.cpp b/libcxx/test/std/containers/sequences/list/types.pass.cpp index 8fe31e3949de..0c0a127bd76f 100644 --- a/libcxx/test/std/containers/sequences/list/types.pass.cpp +++ b/libcxx/test/std/containers/sequences/list/types.pass.cpp @@ -27,6 +27,24 @@ #include "test_macros.h" #include "min_allocator.h" +// Ensures that we don't use a non-uglified name 'base' in the implementation of 'list'. + +struct my_base { + typedef my_base base; +}; + +template > +struct my_derived : my_base, std::list {}; + +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +#if TEST_STD_VER >= 11 +static_assert(std::is_same>::base, my_base>::value, ""); +static_assert(std::is_same>::base, my_base>::value, ""); +static_assert(std::is_same>::base, my_base>::value, ""); +#endif + struct A { std::list v; }; // incomplete type support int main(int, char**) diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp index c9dd923d7130..f1daa7c3dcce 100644 --- a/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp +++ b/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp @@ -8,8 +8,8 @@ // -// This test ensures that we don't use a non-uglified name 'iterator' and -// 'const_iterator' in the implementation of bitset. +// This test ensures that we don't use a non-uglified name 'iterator', +// 'const_iterator', and 'base' in the implementation of bitset. // // See https://github.com/llvm/llvm-project/issues/111125. 
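To make the ambiguity this patch removes concrete, a minimal sketch in the spirit of the types.pass.cpp tests above; my_base, my_derived, and lookup_result are illustrative names, not part of the library:

#include <list>

struct my_base {
  typedef my_base base;
};

// Derives from both a user-provided base class and std::list.
struct my_derived : my_base, std::list<int> {};

// Name lookup runs before access control, so with the old private member
// typedef 'base' in libc++'s std::list this lookup was ambiguous even
// though that typedef was inaccessible; with the member uglified to
// '__base', the lookup unambiguously finds my_base::base.
typedef my_derived::base lookup_result;

The static_asserts added by this patch encode the same check at compile time.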
@@ -20,6 +20,7 @@ struct my_base { typedef int* iterator; typedef const int* const_iterator; + typedef my_base base; }; template @@ -44,3 +45,13 @@ static_assert(std::is_same::const_iterator, const int*>::value, " static_assert(std::is_same::const_iterator, const int*>::value, ""); static_assert(std::is_same::const_iterator, const int*>::value, ""); static_assert(std::is_same::const_iterator, const int*>::value, ""); + +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); +static_assert(std::is_same::base, my_base>::value, ""); -- GitLab From 9f264e4d2feccb5f9b848de7455f1bda168b7633 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Oct 2024 08:39:31 -0700 Subject: [PATCH 073/511] [BOLT] Avoid repeated hash lookups (NFC) (#112822) --- bolt/lib/Passes/VeneerElimination.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Passes/VeneerElimination.cpp b/bolt/lib/Passes/VeneerElimination.cpp index 87fe625e8c3b..8bf0359477c6 100644 --- a/bolt/lib/Passes/VeneerElimination.cpp +++ b/bolt/lib/Passes/VeneerElimination.cpp @@ -73,12 +73,12 @@ Error VeneerElimination::runOnFunctions(BinaryContext &BC) { continue; const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Instr, 0); - if (VeneerDestinations.find(TargetSymbol) == VeneerDestinations.end()) + auto It = VeneerDestinations.find(TargetSymbol); + if (It == VeneerDestinations.end()) continue; VeneerCallers++; - BC.MIB->replaceBranchTarget(Instr, VeneerDestinations[TargetSymbol], - BC.Ctx.get()); + BC.MIB->replaceBranchTarget(Instr, It->second, BC.Ctx.get()); } } } -- GitLab From a99bf0f6c98e8e2927ce7cecbb35b962285e1675 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Oct 2024 08:40:05 -0700 Subject: [PATCH 074/511] [llvm-readtapi] Simplify code with StringMap::operator[] (NFC) (#112824) --- llvm/tools/llvm-readtapi/llvm-readtapi.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp index 6e42ed76949f..1d740109d5b6 100644 --- a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp +++ b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp @@ -325,8 +325,8 @@ static void stubifyDirectory(const StringRef InputPath, Context &Ctx) { continue; } - auto itr = SymLinks.insert({LinkTarget.c_str(), std::vector()}); - itr.first->second.emplace_back(LinkSrc.str(), std::string(SymPath.str())); + SymLinks[LinkTarget.c_str()].emplace_back(LinkSrc.str(), + std::string(SymPath.str())); continue; } -- GitLab From 721b796809eca6e67dcefe45a3498764dda3117d Mon Sep 17 00:00:00 2001 From: Mohammed Keyvanzadeh Date: Fri, 18 Oct 2024 19:12:04 +0330 Subject: [PATCH 075/511] [llvm] prefer isa_and_nonnull over v && isa (#112541) Use `isa_and_nonnull(v)` instead of `v && isa(v)`, where `v` is evaluated twice in the latter. 
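For illustration, a minimal self-contained sketch of the idiom (not part of
the patch; the helper and the UndefValue check below are illustrative, chosen
to mirror the ConstantFolding change that follows):

  #include "llvm/IR/Constants.h"
  #include "llvm/Support/Casting.h"

  // 'Element' may legitimately be null here.
  static bool isUndefElement(const llvm::Constant *Element) {
    // Before: 'Element' is spelled, and evaluated, twice.
    //   return Element && llvm::isa<llvm::UndefValue>(Element);
    // After: one call that treats a null pointer as "not an UndefValue".
    return llvm::isa_and_nonnull<llvm::UndefValue>(Element);
  }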
--- llvm/lib/Analysis/ConstantFolding.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index da0fd1f07c83..74df67a4ff9b 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -82,7 +82,7 @@ static Constant *foldConstVectorToAPInt(APInt &Result, Type *DestTy, else Element = C->getAggregateElement(i); - if (Element && isa(Element)) { + if (isa_and_nonnull(Element)) { Result <<= BitShift; continue; } @@ -219,7 +219,7 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) { unsigned ShiftAmt = isLittleEndian ? 0 : SrcBitSize*(Ratio-1); for (unsigned j = 0; j != Ratio; ++j) { Constant *Src = C->getAggregateElement(SrcElt++); - if (Src && isa(Src)) + if (isa_and_nonnull(Src)) Src = Constant::getNullValue( cast(C->getType())->getElementType()); else -- GitLab From e13f1d1daf9b76134c3585e8250941920bdf3da6 Mon Sep 17 00:00:00 2001 From: knickish Date: Fri, 18 Oct 2024 10:49:26 -0500 Subject: [PATCH 076/511] [M68k] ARII atomic load/store (#108982) Only ARI was supported, this PR adds ARII support for atomic loads/stores (also with zero displacement). Closes #107939 --- llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp | 17 ++++++++- llvm/lib/Target/M68k/M68kInstrAtomics.td | 7 ++++ llvm/test/CodeGen/M68k/Atomics/non-ari.ll | 46 +++++++++++++++++++++++ 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/M68k/Atomics/non-ari.ll diff --git a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp index dc89fec8108c..f496085c8835 100644 --- a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp +++ b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp @@ -772,6 +772,20 @@ static bool isAddressBase(const SDValue &N) { } } +static bool AllowARIIWithZeroDisp(SDNode *Parent) { + if (!Parent) + return false; + switch (Parent->getOpcode()) { + case ISD::LOAD: + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: + return true; + default: + return false; + } +} + bool M68kDAGToDAGISel::SelectARII(SDNode *Parent, SDValue N, SDValue &Disp, SDValue &Base, SDValue &Index) { M68kISelAddressMode AM(M68kISelAddressMode::AddrType::ARII); @@ -811,8 +825,7 @@ bool M68kDAGToDAGISel::SelectARII(SDNode *Parent, SDValue N, SDValue &Disp, // The idea here is that we want to use AddrType::ARII without displacement // only if necessary like memory operations, otherwise this must be lowered // into addition - if (AM.Disp == 0 && (!Parent || (Parent->getOpcode() != ISD::LOAD && - Parent->getOpcode() != ISD::STORE))) { + if (AM.Disp == 0 && !AllowARIIWithZeroDisp(Parent)) { LLVM_DEBUG(dbgs() << "REJECT: Displacement is Zero\n"); return false; } diff --git a/llvm/lib/Target/M68k/M68kInstrAtomics.td b/llvm/lib/Target/M68k/M68kInstrAtomics.td index 84a662533542..9203a3ef4ed0 100644 --- a/llvm/lib/Target/M68k/M68kInstrAtomics.td +++ b/llvm/lib/Target/M68k/M68kInstrAtomics.td @@ -10,9 +10,16 @@ foreach size = [8, 16, 32] in { def : Pat<(!cast("atomic_load_"#size) MxCP_ARI:$ptr), (!cast("MOV"#size#"dj") !cast("MxARI"#size):$ptr)>; + def : Pat<(!cast("atomic_load_"#size) MxCP_ARII:$ptr), + (!cast("MOV"#size#"df") !cast("MxARII"#size):$ptr)>; + def : Pat<(!cast("atomic_store_"#size) !cast("MxDRD"#size):$val, MxCP_ARI:$ptr), (!cast("MOV"#size#"jd") !cast("MxARI"#size):$ptr, !cast("MxDRD"#size):$val)>; + + def : Pat<(!cast("atomic_store_"#size) !cast("MxDRD"#size):$val, MxCP_ARII:$ptr), + (!cast("MOV"#size#"fd") 
!cast("MxARII"#size):$ptr, + !cast("MxDRD"#size):$val)>; } let Predicates = [AtLeastM68020] in { diff --git a/llvm/test/CodeGen/M68k/Atomics/non-ari.ll b/llvm/test/CodeGen/M68k/Atomics/non-ari.ll new file mode 100644 index 000000000000..1ae545ed8722 --- /dev/null +++ b/llvm/test/CodeGen/M68k/Atomics/non-ari.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68000 | FileCheck %s --check-prefix=NO-ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68010 | FileCheck %s --check-prefix=NO-ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68020 | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68030 | FileCheck %s --check-prefix=ATOMIC +; RUN: llc %s -o - -mtriple=m68k -mcpu=M68040 | FileCheck %s --check-prefix=ATOMIC + +define void @atomic_store_i8_element_monotonic(i8 %val, ptr %base, i32 %offset) nounwind { +; NO-ATOMIC-LABEL: atomic_store_i8_element_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.b (7,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (12,%sp), %d1 +; NO-ATOMIC-NEXT: move.l (8,%sp), %a0 +; NO-ATOMIC-NEXT: move.b %d0, (0,%a0,%d1) +; NO-ATOMIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_store_i8_element_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.b (7,%sp), %d0 +; ATOMIC-NEXT: move.l (12,%sp), %d1 +; ATOMIC-NEXT: move.l (8,%sp), %a0 +; ATOMIC-NEXT: move.b %d0, (0,%a0,%d1) +; ATOMIC-NEXT: rts + %store_pointer = getelementptr i8, ptr %base, i32 %offset + store atomic i8 %val, ptr %store_pointer monotonic, align 1 + ret void +} + +define i8 @atomic_load_i8_element_monotonic(ptr %base, i32 %offset) nounwind { +; NO-ATOMIC-LABEL: atomic_load_i8_element_monotonic: +; NO-ATOMIC: ; %bb.0: +; NO-ATOMIC-NEXT: move.l (8,%sp), %d0 +; NO-ATOMIC-NEXT: move.l (4,%sp), %a0 +; NO-ATOMIC-NEXT: move.b (0,%a0,%d0), %d0 +; NO-ATOMIC-NEXT: rts +; +; ATOMIC-LABEL: atomic_load_i8_element_monotonic: +; ATOMIC: ; %bb.0: +; ATOMIC-NEXT: move.l (8,%sp), %d0 +; ATOMIC-NEXT: move.l (4,%sp), %a0 +; ATOMIC-NEXT: move.b (0,%a0,%d0), %d0 +; ATOMIC-NEXT: rts + %load_pointer = getelementptr i8, ptr %base, i32 %offset + %return_val = load atomic i8, ptr %load_pointer monotonic, align 1 + ret i8 %return_val +} -- GitLab From 9d7b35d4e1e0c563e660450687ce475ee1959951 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Fri, 18 Oct 2024 12:16:28 -0400 Subject: [PATCH 077/511] [NFC][GOFF] Fix char overflow (#112826) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is one of the many PRs to fix errors with LLVM_ENABLE_WERROR=on. Built by GCC 11. Fix warnining: llvm/unittests/Object/GOFFObjectFileTest.cpp:511:17: error: overflow in conversion from ‘int’ to ‘char’ changes value from ‘240’ to ‘'\37777777760'’ [-Werror=overflow] 511 | GOFFData[1] = 0xF0; --- llvm/unittests/Object/GOFFObjectFileTest.cpp | 112 +++++++++---------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/llvm/unittests/Object/GOFFObjectFileTest.cpp b/llvm/unittests/Object/GOFFObjectFileTest.cpp index 69f60d016a80..e2fbf81ef23f 100644 --- a/llvm/unittests/Object/GOFFObjectFileTest.cpp +++ b/llvm/unittests/Object/GOFFObjectFileTest.cpp @@ -507,72 +507,72 @@ TEST(GOFFObjectFileTest, TXTConstruct) { char GOFFData[GOFF::RecordLength * 6] = {}; // HDR record. - GOFFData[0] = 0x03; - GOFFData[1] = 0xF0; - GOFFData[50] = 0x01; + GOFFData[0] = (char)0x03; + GOFFData[1] = (char)0xF0; + GOFFData[50] = (char)0x01; // ESD record. 
- GOFFData[GOFF::RecordLength] = 0x03; - GOFFData[GOFF::RecordLength + 7] = 0x01; // ESDID. - GOFFData[GOFF::RecordLength + 71] = 0x05; // Size of symbol name. - GOFFData[GOFF::RecordLength + 72] = 0xa5; // Symbol name is v. - GOFFData[GOFF::RecordLength + 73] = 0x81; // Symbol name is a. - GOFFData[GOFF::RecordLength + 74] = 0x99; // Symbol name is r. - GOFFData[GOFF::RecordLength + 75] = 0x7b; // Symbol name is #. - GOFFData[GOFF::RecordLength + 76] = 0x83; // Symbol name is c. + GOFFData[GOFF::RecordLength] = (char)0x03; + GOFFData[GOFF::RecordLength + 7] = (char)0x01; // ESDID. + GOFFData[GOFF::RecordLength + 71] = (char)0x05; // Size of symbol name. + GOFFData[GOFF::RecordLength + 72] = (char)0xa5; // Symbol name is v. + GOFFData[GOFF::RecordLength + 73] = (char)0x81; // Symbol name is a. + GOFFData[GOFF::RecordLength + 74] = (char)0x99; // Symbol name is r. + GOFFData[GOFF::RecordLength + 75] = (char)0x7b; // Symbol name is #. + GOFFData[GOFF::RecordLength + 76] = (char)0x83; // Symbol name is c. // ESD record. - GOFFData[GOFF::RecordLength * 2] = 0x03; - GOFFData[GOFF::RecordLength * 2 + 3] = 0x01; - GOFFData[GOFF::RecordLength * 2 + 7] = 0x02; // ESDID. - GOFFData[GOFF::RecordLength * 2 + 11] = 0x01; // Parent ESDID. - GOFFData[GOFF::RecordLength * 2 + 27] = 0x08; // Length. - GOFFData[GOFF::RecordLength * 2 + 40] = 0x01; // Name Space ID. - GOFFData[GOFF::RecordLength * 2 + 41] = 0x80; - GOFFData[GOFF::RecordLength * 2 + 60] = 0x04; // Size of symbol name. - GOFFData[GOFF::RecordLength * 2 + 61] = 0x04; // Size of symbol name. - GOFFData[GOFF::RecordLength * 2 + 63] = 0x0a; // Size of symbol name. - GOFFData[GOFF::RecordLength * 2 + 66] = 0x03; // Size of symbol name. - GOFFData[GOFF::RecordLength * 2 + 71] = 0x08; // Size of symbol name. - GOFFData[GOFF::RecordLength * 2 + 72] = 0xc3; // Symbol name is c. - GOFFData[GOFF::RecordLength * 2 + 73] = 0x6d; // Symbol name is _. - GOFFData[GOFF::RecordLength * 2 + 74] = 0xc3; // Symbol name is c. - GOFFData[GOFF::RecordLength * 2 + 75] = 0xd6; // Symbol name is o. - GOFFData[GOFF::RecordLength * 2 + 76] = 0xc4; // Symbol name is D. - GOFFData[GOFF::RecordLength * 2 + 77] = 0xc5; // Symbol name is E. - GOFFData[GOFF::RecordLength * 2 + 78] = 0xf6; // Symbol name is 6. - GOFFData[GOFF::RecordLength * 2 + 79] = 0xf4; // Symbol name is 4. + GOFFData[GOFF::RecordLength * 2] = (char)0x03; + GOFFData[GOFF::RecordLength * 2 + 3] = (char)0x01; + GOFFData[GOFF::RecordLength * 2 + 7] = (char)0x02; // ESDID. + GOFFData[GOFF::RecordLength * 2 + 11] = (char)0x01; // Parent ESDID. + GOFFData[GOFF::RecordLength * 2 + 27] = (char)0x08; // Length. + GOFFData[GOFF::RecordLength * 2 + 40] = (char)0x01; // Name Space ID. + GOFFData[GOFF::RecordLength * 2 + 41] = (char)0x80; + GOFFData[GOFF::RecordLength * 2 + 60] = (char)0x04; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 61] = (char)0x04; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 63] = (char)0x0a; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 66] = (char)0x03; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 71] = (char)0x08; // Size of symbol name. + GOFFData[GOFF::RecordLength * 2 + 72] = (char)0xc3; // Symbol name is c. + GOFFData[GOFF::RecordLength * 2 + 73] = (char)0x6d; // Symbol name is _. + GOFFData[GOFF::RecordLength * 2 + 74] = (char)0xc3; // Symbol name is c. + GOFFData[GOFF::RecordLength * 2 + 75] = (char)0xd6; // Symbol name is o. + GOFFData[GOFF::RecordLength * 2 + 76] = (char)0xc4; // Symbol name is D. 
+ GOFFData[GOFF::RecordLength * 2 + 77] = (char)0xc5; // Symbol name is E. + GOFFData[GOFF::RecordLength * 2 + 78] = (char)0xf6; // Symbol name is 6. + GOFFData[GOFF::RecordLength * 2 + 79] = (char)0xf4; // Symbol name is 4. // ESD record. - GOFFData[GOFF::RecordLength * 3] = 0x03; - GOFFData[GOFF::RecordLength * 3 + 3] = 0x02; - GOFFData[GOFF::RecordLength * 3 + 7] = 0x03; // ESDID. - GOFFData[GOFF::RecordLength * 3 + 11] = 0x02; // Parent ESDID. - GOFFData[GOFF::RecordLength * 3 + 71] = 0x05; // Size of symbol name. - GOFFData[GOFF::RecordLength * 3 + 72] = 0xa5; // Symbol name is v. - GOFFData[GOFF::RecordLength * 3 + 73] = 0x81; // Symbol name is a. - GOFFData[GOFF::RecordLength * 3 + 74] = 0x99; // Symbol name is r. - GOFFData[GOFF::RecordLength * 3 + 75] = 0x7b; // Symbol name is #. - GOFFData[GOFF::RecordLength * 3 + 76] = 0x83; // Symbol name is c. + GOFFData[GOFF::RecordLength * 3] = (char)0x03; + GOFFData[GOFF::RecordLength * 3 + 3] = (char)0x02; + GOFFData[GOFF::RecordLength * 3 + 7] = (char)0x03; // ESDID. + GOFFData[GOFF::RecordLength * 3 + 11] = (char)0x02; // Parent ESDID. + GOFFData[GOFF::RecordLength * 3 + 71] = (char)0x05; // Size of symbol name. + GOFFData[GOFF::RecordLength * 3 + 72] = (char)0xa5; // Symbol name is v. + GOFFData[GOFF::RecordLength * 3 + 73] = (char)0x81; // Symbol name is a. + GOFFData[GOFF::RecordLength * 3 + 74] = (char)0x99; // Symbol name is r. + GOFFData[GOFF::RecordLength * 3 + 75] = (char)0x7b; // Symbol name is #. + GOFFData[GOFF::RecordLength * 3 + 76] = (char)0x83; // Symbol name is c. // TXT record. - GOFFData[GOFF::RecordLength * 4] = 0x03; - GOFFData[GOFF::RecordLength * 4 + 1] = 0x10; - GOFFData[GOFF::RecordLength * 4 + 7] = 0x02; - GOFFData[GOFF::RecordLength * 4 + 23] = 0x08; // Data Length. - GOFFData[GOFF::RecordLength * 4 + 24] = 0x12; - GOFFData[GOFF::RecordLength * 4 + 25] = 0x34; - GOFFData[GOFF::RecordLength * 4 + 26] = 0x56; - GOFFData[GOFF::RecordLength * 4 + 27] = 0x78; - GOFFData[GOFF::RecordLength * 4 + 28] = 0x9a; - GOFFData[GOFF::RecordLength * 4 + 29] = 0xbc; - GOFFData[GOFF::RecordLength * 4 + 30] = 0xde; - GOFFData[GOFF::RecordLength * 4 + 31] = 0xf0; + GOFFData[GOFF::RecordLength * 4] = (char)0x03; + GOFFData[GOFF::RecordLength * 4 + 1] = (char)0x10; + GOFFData[GOFF::RecordLength * 4 + 7] = (char)0x02; + GOFFData[GOFF::RecordLength * 4 + 23] = (char)0x08; // Data Length. + GOFFData[GOFF::RecordLength * 4 + 24] = (char)0x12; + GOFFData[GOFF::RecordLength * 4 + 25] = (char)0x34; + GOFFData[GOFF::RecordLength * 4 + 26] = (char)0x56; + GOFFData[GOFF::RecordLength * 4 + 27] = (char)0x78; + GOFFData[GOFF::RecordLength * 4 + 28] = (char)0x9a; + GOFFData[GOFF::RecordLength * 4 + 29] = (char)0xbc; + GOFFData[GOFF::RecordLength * 4 + 30] = (char)0xde; + GOFFData[GOFF::RecordLength * 4 + 31] = (char)0xf0; // END record. - GOFFData[GOFF::RecordLength * 5] = 0x03; - GOFFData[GOFF::RecordLength * 5 + 1] = 0x40; - GOFFData[GOFF::RecordLength * 5 + 11] = 0x06; + GOFFData[GOFF::RecordLength * 5] = (char)0x03; + GOFFData[GOFF::RecordLength * 5 + 1] = (char)0x40; + GOFFData[GOFF::RecordLength * 5 + 11] = (char)0x06; StringRef Data(GOFFData, GOFF::RecordLength * 6); -- GitLab From 0f3ed9c6505f5727712876c18ad71dba6271bc50 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 18 Oct 2024 17:39:21 +0100 Subject: [PATCH 078/511] [ARM] Use ARM::NoRegister in more places. NFC Similar to #112507, this uses ARM::NoRegister in a few more places, as opposed to the constant 0. 
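A hedged sketch of what the change buys (illustrative only; it assumes the
ARM backend's generated register enum, in which ARM::NoRegister is defined
as 0, so both spellings produce the same operand):

  #include "llvm/MC/MCInst.h"

  // Append an "absent" register operand to an instruction.
  static void addAbsentReg(llvm::MCInst &Inst) {
    // Before: llvm::MCOperand::createReg(0) - a magic number.
    // After: the named constant documents that no register is intended.
    Inst.addOperand(llvm::MCOperand::createReg(ARM::NoRegister));
  }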
--- llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 2 +- llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 54eb0118d778..906519fef45d 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -2550,7 +2550,7 @@ public: addVPTPredNOperands(Inst, N-1); MCRegister RegNum; if (getVPTPred() == ARMVCC::None) { - RegNum = MCRegister(); + RegNum = ARM::NoRegister; } else { unsigned NextOpIndex = Inst.getNumOperands(); auto &MCID = Parser->getInstrDesc(Inst.getOpcode()); diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 814b71d17319..38280adf2757 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -993,7 +993,7 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const { CCI = MI.insert(CCI, MCOperand::createImm(CC)); ++CCI; if (CC == ARMCC::AL) - MI.insert(CCI, MCOperand::createReg(0)); + MI.insert(CCI, MCOperand::createReg(ARM::NoRegister)); else MI.insert(CCI, MCOperand::createReg(ARM::CPSR)); } else if (CC != ARMCC::AL) { @@ -1060,7 +1060,7 @@ void ARMDisassembler::UpdateThumbVFPPredicate( I->setImm(CC); ++I; if (CC == ARMCC::AL) - I->setReg(0); + I->setReg(ARM::NoRegister); else I->setReg(ARM::CPSR); return; @@ -1648,7 +1648,7 @@ static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, Check(S, MCDisassembler::SoftFail); Inst.addOperand(MCOperand::createImm(Val)); if (Val == ARMCC::AL) { - Inst.addOperand(MCOperand::createReg(0)); + Inst.addOperand(MCOperand::createReg(ARM::NoRegister)); } else Inst.addOperand(MCOperand::createReg(ARM::CPSR)); return S; @@ -1660,7 +1660,7 @@ static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, if (Val) Inst.addOperand(MCOperand::createReg(ARM::CPSR)); else - Inst.addOperand(MCOperand::createReg(0)); + Inst.addOperand(MCOperand::createReg(ARM::NoRegister)); return MCDisassembler::Success; } -- GitLab From 3a91611f3bcd46b0b7352d6cb997c999d58facef Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Oct 2024 16:58:00 +0100 Subject: [PATCH 079/511] [X86] Ensure the AVX1-only broadcast-load patterns are in the same place. NFC. 
---
 llvm/lib/Target/X86/X86InstrSSE.td | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e77e56aa96c6..036d7d92f3f8 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7741,12 +7741,14 @@ let Predicates = [HasAVX2, NoVLX] in {
 
 // AVX1 broadcast patterns
 let Predicates = [HasAVX1Only] in {
-def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
-          (VBROADCASTSSYrm addr:$src)>;
-def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
-          (VBROADCASTSDYrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
-          (VBROADCASTSSrm addr:$src)>;
+  def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
+            (VBROADCASTSSYrm addr:$src)>;
+  def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
+            (VBROADCASTSDYrm addr:$src)>;
+  def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
+            (VBROADCASTSSrm addr:$src)>;
+  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
+            (VMOVDDUPrm addr:$src)>;
 }
 
 // Provide fallback in case the load node that is used in the patterns above
@@ -7795,9 +7797,6 @@ let Predicates = [HasAVX1Only] in {
   def : Pat<(v2i64 (X86VBroadcast i64:$src)),
             (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
 
-  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
-            (VMOVDDUPrm addr:$src)>;
-
   def : Pat<(v4i64 (X86VBroadcast v2i64:$src)),
             (VINSERTF128rri (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
                              (v2i64 (VPSHUFDri VR128:$src, 0x44)), sub_xmm),
-- 
GitLab

From 7da0a698526ff657c2348a6e4bb835fc764177da Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 18 Oct 2024 17:35:51 +0100
Subject: [PATCH 080/511] [X86] andnot-patterns.ll - add non-BMI test coverage

Extra test coverage for #112547 to test cases where we don't create an
ANDNOT instruction

---
 llvm/test/CodeGen/X86/andnot-patterns.ll | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll
index 46ebe6ba7656..101e4ed008f7 100644
--- a/llvm/test/CodeGen/X86/andnot-patterns.ll
+++ b/llvm/test/CodeGen/X86/andnot-patterns.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-- -mattr=+bmi | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=i686-- -mattr=-bmi | FileCheck %s --check-prefixes=X86,X86-NOBMI
+; RUN: llc < %s -mtriple=i686-- -mattr=+bmi | FileCheck %s --check-prefixes=X86,X86-BMI
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
 
 ; TODO - PR112425 - attempt to reconstruct andnot patterns through bitwise-agnostic operations
 
@@ -624,3 +626,8 @@ define i8 @andnot_bitreverse_i8(i8 %a0, i8 %a1) nounwind {
   %and = and i8 %bitrev, %a0
   ret i8 %and
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-BMI: {{.*}}
+; X64-NOBMI: {{.*}}
+; X86-BMI: {{.*}}
+; X86-NOBMI: {{.*}}
-- 
GitLab

From 8182f8176ec0eb9f96ad50280c759ef6e2ca2d60 Mon Sep 17 00:00:00 2001
From: Jinsong Ji
Date: Fri, 18 Oct 2024 12:51:29 -0400
Subject: [PATCH 081/511] [NFC] Fix C++ style comment in C file (#112814)

This is one of the many PRs to fix errors with LLVM_ENABLE_WERROR=on.
Built by GCC 11.
Fix warnings: llvm-project/clang/include/clang-c/Index.h:2983:3: error: C++ style comments are not allowed in ISO C90 [-Werror] 2983 | // HLSL Types --- clang/include/clang-c/Index.h | 2 +- llvm/include/llvm/Support/AutoConvert.h | 34 +++++++++++++------------ 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 4f99bf4ebe30..0c5ac80772e2 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2980,7 +2980,7 @@ enum CXTypeKind { CXType_Atomic = 177, CXType_BTFTagAttributed = 178, - // HLSL Types + /* HLSL Types */ CXType_HLSLResource = 179, CXType_HLSLAttributedResource = 180 }; diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 6f45c4683f77..65ac576ae567 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -1,4 +1,4 @@ -//===- AutoConvert.h - Auto conversion between ASCII/EBCDIC -----*- C++ -*-===// +/*===- AutoConvert.h - Auto conversion between ASCII/EBCDIC -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,7 +9,7 @@ // This file contains functions used for auto conversion between // ASCII/EBCDIC codepages specific to z/OS. // -//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===*/ #ifndef LLVM_SUPPORT_AUTOCONVERT_H #define LLVM_SUPPORT_AUTOCONVERT_H @@ -18,7 +18,7 @@ #include <_Ccsid.h> #ifdef __cplusplus #include -#endif // __cplusplus +#endif /* __cplusplus */ #define CCSID_IBM_1047 1047 #define CCSID_UTF_8 1208 @@ -26,35 +26,37 @@ #ifdef __cplusplus extern "C" { -#endif // __cplusplus +#endif /* __cplusplus */ int enablezOSAutoConversion(int FD); int disablezOSAutoConversion(int FD); int restorezOSStdHandleAutoConversion(int FD); #ifdef __cplusplus } -#endif // __cplusplus +#endif /* __cplusplus */ #ifdef __cplusplus namespace llvm { -/// \brief Disable the z/OS enhanced ASCII auto-conversion for the file -/// descriptor. +/** \brief Disable the z/OS enhanced ASCII auto-conversion for the file + * descriptor. + */ std::error_code disablezOSAutoConversion(int FD); -/// \brief Query the z/OS enhanced ASCII auto-conversion status of a file -/// descriptor and force the conversion if the file is not tagged with a -/// codepage. +/** \brief Query the z/OS enhanced ASCII auto-conversion status of a file + * descriptor and force the conversion if the file is not tagged with a + * codepage. + */ std::error_code enablezOSAutoConversion(int FD); -/// Restore the z/OS enhanced ASCII auto-conversion for the std handle. +/** Restore the z/OS enhanced ASCII auto-conversion for the std handle. */ std::error_code restorezOSStdHandleAutoConversion(int FD); -/// \brief Set the tag information for a file descriptor. +/** \brief Set the tag information for a file descriptor. 
*/ std::error_code setzOSFileTag(int FD, int CCSID, bool Text); -} // namespace llvm -#endif // __cplusplus +} /* namespace llvm */ +#endif /* __cplusplus */ -#endif // __MVS__ +#endif /* __MVS__ */ -#endif // LLVM_SUPPORT_AUTOCONVERT_H +#endif /* LLVM_SUPPORT_AUTOCONVERT_H */ -- GitLab From 6a623e8484fe713d7074acb7ceab520458bfa89c Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Fri, 18 Oct 2024 12:53:59 -0400 Subject: [PATCH 082/511] [NFC] add DiagnosticsTestHelper decl (#112820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is one of the many PRs to fix errors with LLVM_ENABLE_WERROR=on. Built by GCC 11. Fix warning llvm-project/clang/unittests/Basic/DiagnosticTest.cpp:19:6: error: ‘void clang::DiagnosticsTestHelper(clang::DiagnosticsEngine&)’ has not been declared within ‘clang’ [-Werror] 19 | void clang::DiagnosticsTestHelper(DiagnosticsEngine &diag) { | ^~~~~ In file included from llvm-project/clang/unittests/Basic/DiagnosticTest.cpp:9: llvm-project/clang/include/clang/Basic/Diagnostic.h:567:15: note: only here as a ‘friend’ 567 | friend void DiagnosticsTestHelper(DiagnosticsEngine &); --- clang/unittests/Basic/DiagnosticTest.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/clang/unittests/Basic/DiagnosticTest.cpp b/clang/unittests/Basic/DiagnosticTest.cpp index 691d74f697f2..d8d23e3b6700 100644 --- a/clang/unittests/Basic/DiagnosticTest.cpp +++ b/clang/unittests/Basic/DiagnosticTest.cpp @@ -16,6 +16,11 @@ using namespace llvm; using namespace clang; +// Declare DiagnosticsTestHelper to avoid GCC warning +namespace clang { +void DiagnosticsTestHelper(DiagnosticsEngine &diag); +} + void clang::DiagnosticsTestHelper(DiagnosticsEngine &diag) { EXPECT_FALSE(diag.DiagStates.empty()); EXPECT_TRUE(diag.DiagStatesByLoc.empty()); -- GitLab From a24a420c2b4854598ac24a571a8275bfaa1b9159 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 18 Oct 2024 10:02:16 -0700 Subject: [PATCH 083/511] [SandboxIR] Implement FPMathOperator (#112921) This patch implements sandboxir::FPMathOperator mirroring llvm::FPMathOperator --- llvm/include/llvm/SandboxIR/Operator.h | 39 +++++++++++++++++ llvm/include/llvm/SandboxIR/Type.h | 6 ++- llvm/include/llvm/SandboxIR/Value.h | 2 + llvm/unittests/SandboxIR/OperatorTest.cpp | 53 +++++++++++++++++++++++ 4 files changed, 98 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Operator.h b/llvm/include/llvm/SandboxIR/Operator.h index 95c450807191..f19c54c75e42 100644 --- a/llvm/include/llvm/SandboxIR/Operator.h +++ b/llvm/include/llvm/SandboxIR/Operator.h @@ -55,6 +55,45 @@ public: return llvm::OverflowingBinaryOperator::classof(From->Val); } }; + +class FPMathOperator : public Operator { +public: + bool isFast() const { return cast(Val)->isFast(); } + bool hasAllowReassoc() const { + return cast(Val)->hasAllowReassoc(); + } + bool hasNoNaNs() const { + return cast(Val)->hasNoNaNs(); + } + bool hasNoInfs() const { + return cast(Val)->hasNoInfs(); + } + bool hasNoSignedZeros() const { + return cast(Val)->hasNoSignedZeros(); + } + bool hasAllowReciprocal() const { + return cast(Val)->hasAllowReciprocal(); + } + bool hasAllowContract() const { + return cast(Val)->hasAllowContract(); + } + bool hasApproxFunc() const { + return cast(Val)->hasApproxFunc(); + } + FastMathFlags getFastMathFlags() const { + return cast(Val)->getFastMathFlags(); + } + float getFPAccuracy() const { + return cast(Val)->getFPAccuracy(); + } + static bool isSupportedFloatingPointType(Type *Ty) { + return 
llvm::FPMathOperator::isSupportedFloatingPointType(Ty->LLVMTy); + } + static bool classof(const Value *V) { + return llvm::FPMathOperator::classof(V->Val); + } +}; + } // namespace llvm::sandboxir #endif // LLVM_SANDBOXIR_OPERATOR_H diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 8094f66567fb..9d1db11edb05 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -33,12 +33,13 @@ class ArrayType; class StructType; class TargetExtType; class Module; +class FPMathOperator; #define DEF_INSTR(ID, OPCODE, CLASS) class CLASS; #define DEF_CONST(ID, CLASS) class CLASS; #include "llvm/SandboxIR/Values.def" -/// Just like llvm::Type these are immutable, unique, never get freed and can -/// only be created via static factory methods. +/// Just like llvm::Type these are immutable, unique, never get freed and +/// can only be created via static factory methods. class Type { protected: llvm::Type *LLVMTy; @@ -61,6 +62,7 @@ protected: friend class Utils; // for LLVMTy friend class TargetExtType; // For LLVMTy. friend class Module; // For LLVMTy. + friend class FPMathOperator; // For LLVMTy. // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index 58088684bf18..243195f4c1c4 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -30,6 +30,7 @@ class CmpInst; class IntrinsicInst; class Operator; class OverflowingBinaryOperator; +class FPMathOperator; /// Iterator for the `Use` edges of a Value's users. /// \Returns a `Use` when dereferenced. @@ -162,6 +163,7 @@ protected: friend class IntrinsicInst; // For `Val`. friend class Operator; // For `Val`. friend class OverflowingBinaryOperator; // For `Val`. + friend class FPMathOperator; // For `Val`. // Region needs to manipulate metadata in the underlying LLVM Value, we don't // expose metadata in sandboxir. 
friend class Region; diff --git a/llvm/unittests/SandboxIR/OperatorTest.cpp b/llvm/unittests/SandboxIR/OperatorTest.cpp index 031e2adf4069..b1e324417da4 100644 --- a/llvm/unittests/SandboxIR/OperatorTest.cpp +++ b/llvm/unittests/SandboxIR/OperatorTest.cpp @@ -86,3 +86,56 @@ define void @foo(i8 %v1) { EXPECT_EQ(AddNUW->getNoWrapKind(), llvm::OverflowingBinaryOperator::NoUnsignedWrap); } + +TEST_F(OperatorTest, FPMathOperator) { + parseIR(C, R"IR( +define void @foo(float %v1, double %v2) { + %fadd = fadd float %v1, 42.0 + %Fast = fadd fast float %v1, 42.0 + %Reassoc = fmul reassoc float %v1, 42.0 + %NNAN = fmul nnan float %v1, 42.0 + %NINF = fmul ninf float %v1, 42.0 + %NSZ = fmul nsz float %v1, 42.0 + %ARCP = fmul arcp float %v1, 42.0 + %CONTRACT = fmul contract float %v1, 42.0 + %AFN = fmul afn double %v2, 42.0 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + auto *LLVMBB = &*LLVMF->begin(); + auto LLVMIt = LLVMBB->begin(); + + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto TermIt = BB->getTerminator()->getIterator(); + while (It != TermIt) { + auto *FPM = cast(&*It++); + auto *LLVMFPM = cast(&*LLVMIt++); + EXPECT_EQ(FPM->isFast(), LLVMFPM->isFast()); + EXPECT_EQ(FPM->hasAllowReassoc(), LLVMFPM->hasAllowReassoc()); + EXPECT_EQ(FPM->hasNoNaNs(), LLVMFPM->hasNoNaNs()); + EXPECT_EQ(FPM->hasNoInfs(), LLVMFPM->hasNoInfs()); + EXPECT_EQ(FPM->hasNoSignedZeros(), LLVMFPM->hasNoSignedZeros()); + EXPECT_EQ(FPM->hasAllowReciprocal(), LLVMFPM->hasAllowReciprocal()); + EXPECT_EQ(FPM->hasAllowContract(), LLVMFPM->hasAllowContract()); + EXPECT_EQ(FPM->hasApproxFunc(), LLVMFPM->hasApproxFunc()); + + // There doesn't seem to be an operator== for FastMathFlags so let's do a + // string comparison instead. + std::string Str1; + raw_string_ostream SS1(Str1); + std::string Str2; + raw_string_ostream SS2(Str2); + FPM->getFastMathFlags().print(SS1); + LLVMFPM->getFastMathFlags().print(SS2); + EXPECT_EQ(Str1, Str2); + + EXPECT_EQ(FPM->getFPAccuracy(), LLVMFPM->getFPAccuracy()); + EXPECT_EQ( + sandboxir::FPMathOperator::isSupportedFloatingPointType(FPM->getType()), + llvm::FPMathOperator::isSupportedFloatingPointType(LLVMFPM->getType())); + } +} -- GitLab From 6c60ead15a8932b30823a89b6686f7cee240f751 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Fri, 18 Oct 2024 13:07:37 -0400 Subject: [PATCH 084/511] [NFC] Fix Werror=extra warning related to mismatched enum type (#112808) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is one of the many PRs to fix errors with LLVM_ENABLE_WERROR=on. Built by GCC 11. Fix warnings: llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp: In member function ‘void llvm::AsmPrinter::emitJumpTableSizesSection(const llvm::MachineJumpTableInfo*, const llvm::Function&) const’: llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp:2852:31: error: enumerated and non-enumerated type in conditional expression [-Werror=extra] 2852 | int Flags = F.hasComdat() ? 
ELF::SHF_GROUP : 0; | ~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~ --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 3a8cde7330ef..327e7f7f8a1e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2849,7 +2849,7 @@ void AsmPrinter::emitJumpTableSizesSection(const MachineJumpTableInfo *MJTI, if (isElf) { MCSymbolELF *LinkedToSym = dyn_cast(CurrentFnSym); - int Flags = F.hasComdat() ? ELF::SHF_GROUP : 0; + int Flags = F.hasComdat() ? (unsigned)ELF::SHF_GROUP : 0; JumpTableSizesSection = OutContext.getELFSection( sectionName, ELF::SHT_LLVM_JT_SIZES, Flags, 0, GroupName, F.hasComdat(), -- GitLab From 6264288d70610c40256f96f003e14ab5e8890fb8 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 18 Oct 2024 10:12:23 -0700 Subject: [PATCH 085/511] [MemProf] Fix the option to disable memprof ICP (#112917) The -enable-memprof-indirect-call-support meant to guard the recently added memprof ICP support was not used in enough places. Specifically, it was not checked in mayHaveMemprofSummary, which is called from the ThinLTO backend applyImports. This led to failures when checking the callsite records, as we incorrectly expected records for indirect calls. Fix the option to be checked in all necessary locations, and add testing. --- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 10 ++++- llvm/test/ThinLTO/X86/memprof-icp.ll | 45 +++++++++++++++++++-- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index 1bd9ee651d2b..0f4e85f5123f 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -503,6 +503,10 @@ static void computeFunctionSummary( if (!IsThinLTO) continue; + // Skip indirect calls if we haven't enabled memprof ICP. + if (!CalledFunction && !EnableMemProfIndirectCallSupport) + continue; + // Ensure we keep this analysis in sync with the handling in the ThinLTO // backend (see MemProfContextDisambiguation::applyImport). Save this call // so that we can skip it in checking the reverse case later. @@ -561,7 +565,8 @@ static void computeFunctionSummary( auto CalleeValueInfo = Index.getOrInsertValueInfo(cast(CalledValue)); Callsites.push_back({CalleeValueInfo, StackIdIndices}); - } else if (EnableMemProfIndirectCallSupport) { + } else { + assert(EnableMemProfIndirectCallSupport); // For indirect callsites, create multiple Callsites, one per target. // This enables having a different set of clone versions per target, // and we will apply the cloning decisions while speculatively @@ -1223,6 +1228,9 @@ bool llvm::mayHaveMemprofSummary(const CallBase *CB) { if (CI && CalledFunction->isIntrinsic()) return false; } else { + // Skip indirect calls if we haven't enabled memprof ICP. + if (!EnableMemProfIndirectCallSupport) + return false; // Skip inline assembly calls. if (CI && CI->isInlineAsm()) return false; diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll index 5c6d4e383d32..e19c56b90e62 100644 --- a/llvm/test/ThinLTO/X86/memprof-icp.ll +++ b/llvm/test/ThinLTO/X86/memprof-icp.ll @@ -76,17 +76,18 @@ ;; for each profiled target in the VP metadata. They will have the same stackIds ;; since the debug information for the callsite is the same. 
; RUN: llvm-dis %t/foo.o -o - | FileCheck %s --check-prefix=CALLSITES -; CALLSITES: gv: (name: "_Z3fooR2B0j", {{.*}} callsites: ((callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)), (callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235))) +; CALLSITES: gv: (name: "_Z3fooR2B0j", {{.*}} callsites: ((callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)), (callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)) ;; Make sure that we don't get the synthesized callsite records if the ;; -enable-memprof-indirect-call-support flag is false. -; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=false -o - \ -; RUN: | llvm-dis -o - | FileCheck %s --implicit-check-not callsites +; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=false >%t/foo.noicp.o +; RUN: llvm-dis %t/foo.noicp.o -o - | FileCheck %s --implicit-check-not "stackIds: (16345663650247127235)" ;; First perform in-process ThinLTO ; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \ ; RUN: -supports-hot-cold-new \ ; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \ +; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \ ; RUN: -r=%t/main.o,_Z3fooR2B0j, \ ; RUN: -r=%t/main.o,_Znwm, \ ; RUN: -r=%t/main.o,_ZdlPvm, \ @@ -116,6 +117,7 @@ ; RUN: -supports-hot-cold-new \ ; RUN: -thinlto-distributed-indexes \ ; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \ +; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \ ; RUN: -r=%t/main.o,_Z3fooR2B0j, \ ; RUN: -r=%t/main.o,_Znwm, \ ; RUN: -r=%t/main.o,_ZdlPvm, \ @@ -141,6 +143,36 @@ ; RUN: %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR \ ; RUN: --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO +;; Retry with the ICP-disabled object file, and make sure we disable it again +;; so we don't look for the synthesized callsite records when applying imports. +;; We should not get any cloning. +; RUN: llvm-lto2 run %t/main.o %t/foo.noicp.o -enable-memprof-context-disambiguation \ +; RUN: -enable-memprof-indirect-call-support=false \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t/foo.noicp.o,_Z3fooR2B0j,plx \ +; RUN: -r=%t/foo.noicp.o,_Z3xyzR2B0j, \ +; RUN: -r=%t/main.o,_Z3fooR2B0j, \ +; RUN: -r=%t/main.o,_Znwm, \ +; RUN: -r=%t/main.o,_ZdlPvm, \ +; RUN: -r=%t/main.o,_Z8externalPi, \ +; RUN: -r=%t/main.o,main,plx \ +; RUN: -r=%t/main.o,_ZN2B03barEj,plx \ +; RUN: -r=%t/main.o,_ZN1B3barEj,plx \ +; RUN: -r=%t/main.o,_ZTV1B,plx \ +; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \ +; RUN: -r=%t/main.o,_ZTS1B,plx \ +; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \ +; RUN: -r=%t/main.o,_ZTS2B0,plx \ +; RUN: -r=%t/main.o,_ZTI2B0,plx \ +; RUN: -r=%t/main.o,_ZTI1B,plx \ +; RUN: -r=%t/main.o,_ZTV2B0,plx \ +; RUN: -thinlto-threads=1 \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -pass-remarks=. 
-save-temps \ +; RUN: -o %t.noicp.out 2>&1 | FileCheck %s --implicit-check-not "created clone" + +; RUN: llvm-dis %t.noicp.out.2.4.opt.bc -o - | FileCheck %s --implicit-check-not "_Z3fooR2B0j.memprof" + ; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1 ; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1 ; REMARKS-MAIN: created clone _ZN2B03barEj.memprof.1 @@ -215,15 +247,22 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +declare i32 @_Z3xyzR2B0j(ptr %b) + define i32 @_Z3fooR2B0j(ptr %b) { entry: %0 = load ptr, ptr %b, align 8 %call = tail call i32 %0(ptr null, i32 0), !prof !0, !callsite !1 + ;; Add a dummy call to ensure that we have some callsite metadata, + ;; which triggers callsite record checking in the ThinLTO backend + ;; even with -enable-memprof-indirect-call-support=false. + %call2 = call i32 @_Z3xyzR2B0j(ptr null, i32 0), !callsite !2 ret i32 0 } !0 = !{!"VP", i32 0, i64 4, i64 4445083295448962937, i64 2, i64 -2718743882639408571, i64 2} !1 = !{i64 -2101080423462424381} +!2 = !{i64 1234} ;--- main.ll target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" -- GitLab From 9120adea504981dfd55ace25825f84018543d6f2 Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Fri, 18 Oct 2024 10:19:48 -0700 Subject: [PATCH 086/511] Fix build break in SemaHLSL.cpp on MSVC 2022: warning C4715: 'getResourceClass': not all control paths return a value (#112767) Moves the existing `llvm_unreachable` statement to the bottom of the function and changes the case statement to deliberately fall through to it. Build break was introduced by #111203 It was not caught by the builders as they use Visual Studio 2019, whereas this warning only appears in 2022. --------- Co-authored-by: Matheus Izvekov --- clang/lib/Sema/SemaHLSL.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 1d18a6308e2a..c6627b0e9932 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -102,8 +102,10 @@ static ResourceClass getResourceClass(RegisterType RT) { return ResourceClass::Sampler; case RegisterType::C: case RegisterType::I: - llvm_unreachable("unexpected RegisterType value"); + // Deliberately falling through to the unreachable below. 
+ break; } + llvm_unreachable("unexpected RegisterType value"); } DeclBindingInfo *ResourceBindings::addDeclBindingInfo(const VarDecl *VD, -- GitLab From a01d7df09048e0b0b002c3f8420bcc8c7eab3ea0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Oct 2024 10:40:13 -0700 Subject: [PATCH 087/511] [lldb] Avoid repeated map lookups (NFC) (#112823) --- lldb/source/Commands/CommandObjectMultiword.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lldb/source/Commands/CommandObjectMultiword.cpp b/lldb/source/Commands/CommandObjectMultiword.cpp index 484d9022027e..b4cdfea9b1a3 100644 --- a/lldb/source/Commands/CommandObjectMultiword.cpp +++ b/lldb/source/Commands/CommandObjectMultiword.cpp @@ -102,11 +102,9 @@ llvm::Error CommandObjectMultiword::LoadUserSubcommand( std::string str_name(name); - auto pos = m_subcommand_dict.find(str_name); - if (pos == m_subcommand_dict.end()) { - m_subcommand_dict[str_name] = cmd_obj_sp; + auto [pos, inserted] = m_subcommand_dict.try_emplace(str_name, cmd_obj_sp); + if (inserted) return llvm::Error::success(); - } const char *error_str = nullptr; if (!can_replace) @@ -117,7 +115,7 @@ llvm::Error CommandObjectMultiword::LoadUserSubcommand( if (error_str) { return llvm::createStringError(llvm::inconvertibleErrorCode(), error_str); } - m_subcommand_dict[str_name] = cmd_obj_sp; + pos->second = cmd_obj_sp; return llvm::Error::success(); } -- GitLab From 5995e4b97b593d156b05a729008dd1bc2604d91a Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 18 Oct 2024 10:40:27 -0700 Subject: [PATCH 088/511] [MemProf] Disable memprof ICP support by default (#112940) A failure showed up after this was committed, rather than revert simply disable this new support to simplify investigation and further testing. --- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 2 +- llvm/test/ThinLTO/X86/memprof-icp.ll | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index 0f4e85f5123f..004e8b76a3c8 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -82,7 +82,7 @@ static cl::opt ModuleSummaryDotFile( cl::desc("File to emit dot graph of new summary into")); static cl::opt EnableMemProfIndirectCallSupport( - "enable-memprof-indirect-call-support", cl::init(true), cl::Hidden, + "enable-memprof-indirect-call-support", cl::init(false), cl::Hidden, cl::desc( "Enable MemProf support for summarizing and cloning indirect calls")); diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll index e19c56b90e62..2e976794425b 100644 --- a/llvm/test/ThinLTO/X86/memprof-icp.ll +++ b/llvm/test/ThinLTO/X86/memprof-icp.ll @@ -69,8 +69,9 @@ ; RUN: split-file %s %t -; RUN: opt -thinlto-bc %t/main.ll >%t/main.o -; RUN: opt -thinlto-bc %t/foo.ll >%t/foo.o +;; For now explicitly turn on this handling, which is off by default. +; RUN: opt -thinlto-bc %t/main.ll -enable-memprof-indirect-call-support=true >%t/main.o +; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=true >%t/foo.o ;; Check that we get the synthesized callsite records. There should be 2, one ;; for each profiled target in the VP metadata. They will have the same stackIds @@ -82,9 +83,12 @@ ;; -enable-memprof-indirect-call-support flag is false. 
; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=false >%t/foo.noicp.o ; RUN: llvm-dis %t/foo.noicp.o -o - | FileCheck %s --implicit-check-not "stackIds: (16345663650247127235)" +;; Currently this should be off by default as well. +; RUN: opt -thinlto-bc %t/foo.ll -o - | llvm-dis -o - | FileCheck %s --implicit-check-not "stackIds: (16345663650247127235)" ;; First perform in-process ThinLTO ; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \ +; RUN: -enable-memprof-indirect-call-support=true \ ; RUN: -supports-hot-cold-new \ ; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \ ; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \ @@ -138,6 +142,7 @@ ;; Run ThinLTO backend ; RUN: opt -import-all-index -passes=function-import,memprof-context-disambiguation,inline \ +; RUN: -enable-memprof-indirect-call-support=true \ ; RUN: -summary-file=%t/foo.o.thinlto.bc -memprof-import-summary=%t/foo.o.thinlto.bc \ ; RUN: -enable-import-metadata -stats -pass-remarks=. \ ; RUN: %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR \ -- GitLab From f9d3e98207c8b5cd86d245050569eaf38809045d Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:03:16 -0700 Subject: [PATCH 089/511] [lld-macho] Improve robustness of ObjC category merging (#112618) This patch enhances the robustness of lld's Objective-C category merging. Currently, the category merger assumes it can fully parse and understand the format of all categories in the input, triggering an assert if any invalid category data is encountered. This will end up causing asserts in certain rare corner cases that are difficult to reproduce in small test cases. The proposed changes modify the behavior so that if invalid category data is detected, category merging is skipped for that specific class and all other categories sharing the same base class. This approach allows the linker to continue processing other categories without failing entirely due to a single problematic input. We also add a LIT test to where we corrupt category data and check that category merging for that class was skipped but the link was successful. 
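In outline, the patch swaps assert-on-malformed-input for a recoverable
failure path. A simplified, generic sketch of that pattern (illustrative
code, not lld's actual API; ParsedInfo, parseOne, and mergeAll are
hypothetical names):

  #include <vector>

  struct ParsedInfo { /* accumulated category data */ };

  // Before: a void parse routine that asserted on unexpected input.
  // After: report failure to the caller instead of asserting.
  static bool parseOne(const char *Data, ParsedInfo &Out) {
    if (!Data) // malformed input: bail out rather than assert
      return false;
    // ... populate Out ...
    return true;
  }

  static bool mergeAll(const std::vector<const char *> &Inputs) {
    ParsedInfo Info;
    for (const char *Data : Inputs)
      if (!parseOne(Data, Info))
        return false; // the caller warns and skips only this class
    // ... emit the merged result ...
    return true;
  }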
--- lld/MachO/ObjC.cpp | 99 +++++++++++++------ .../MachO/objc-category-merging-minimal.s | 13 +++ 2 files changed, 80 insertions(+), 32 deletions(-) diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp index b9f7592fa9c6..ff13e8eb4b5c 100644 --- a/lld/MachO/ObjC.cpp +++ b/lld/MachO/ObjC.cpp @@ -423,7 +423,7 @@ public: private: DenseSet collectNlCategories(); void collectAndValidateCategoriesData(); - void + bool mergeCategoriesIntoSingleCategory(std::vector &categories); void eraseISec(ConcatInputSection *isec); @@ -434,8 +434,8 @@ private: catListToErasedOffsets); void collectSectionWriteInfoFromIsec(const InputSection *isec, InfoWriteSection &catWriteInfo); - void collectCategoryWriterInfoFromCategory(const InfoInputCategory &catInfo); - void parseCatInfoToExtInfo(const InfoInputCategory &catInfo, + bool collectCategoryWriterInfoFromCategory(const InfoInputCategory &catInfo); + bool parseCatInfoToExtInfo(const InfoInputCategory &catInfo, ClassExtensionInfo &extInfo); void parseProtocolListInfo(const ConcatInputSection *isec, uint32_t secOffset, @@ -446,7 +446,7 @@ private: uint32_t secOffset, SourceLanguage sourceLang); - void parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset, + bool parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset, PointerListInfo &ptrList); void emitAndLinkPointerList(Defined *parentSym, uint32_t linkAtOffset, @@ -474,7 +474,7 @@ private: uint32_t offset); Defined *getClassRo(const Defined *classSym, bool getMetaRo); SourceLanguage getClassSymSourceLang(const Defined *classSym); - void mergeCategoriesIntoBaseClass(const Defined *baseClass, + bool mergeCategoriesIntoBaseClass(const Defined *baseClass, std::vector &categories); void eraseSymbolAtIsecOffset(ConcatInputSection *isec, uint32_t offset); void tryEraseDefinedAtIsecOffset(const ConcatInputSection *isec, @@ -543,9 +543,9 @@ ObjcCategoryMerger::tryGetSymbolAtIsecOffset(const ConcatInputSection *isec, if (!reloc) return nullptr; - Symbol *sym = reloc->referent.get(); + Symbol *sym = reloc->referent.dyn_cast(); - if (reloc->addend) { + if (reloc->addend && sym) { assert(isa(sym) && "Expected defined for non-zero addend"); Defined *definedSym = cast(sym); sym = tryFindDefinedOnIsec(definedSym->isec(), @@ -618,7 +618,7 @@ void ObjcCategoryMerger::tryEraseDefinedAtIsecOffset( } } -void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory( +bool ObjcCategoryMerger::collectCategoryWriterInfoFromCategory( const InfoInputCategory &catInfo) { if (!infoCategoryWriter.catListInfo.valid) @@ -631,7 +631,14 @@ void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory( if (!infoCategoryWriter.catNameInfo.valid) { lld::macho::Defined *catNameSym = tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset); - assert(catNameSym && "Category does not have a valid name Symbol"); + + if (!catNameSym) { + // This is an unhandeled case where the category name is not a symbol but + // instead points to an CStringInputSection (that doesn't have any symbol) + // TODO: Find a small repro and either fix or add a test case for this + // scenario + return false; + } collectSectionWriteInfoFromIsec(catNameSym->isec(), infoCategoryWriter.catNameInfo); @@ -651,6 +658,8 @@ void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory( } } } + + return true; } // Parse a protocol list that might be linked to ConcatInputSection at a given @@ -723,7 +732,7 @@ ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec, // Parse a pointer list that might be linked to 
ConcatInputSection at a given // offset. This can be used for instance methods, class methods, instance props // and class props since they have the same format. -void ObjcCategoryMerger::parsePointerListInfo(const ConcatInputSection *isec, +bool ObjcCategoryMerger::parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset, PointerListInfo &ptrList) { assert(ptrList.pointersPerStruct == 2 || ptrList.pointersPerStruct == 3); @@ -732,8 +741,9 @@ void ObjcCategoryMerger::parsePointerListInfo(const ConcatInputSection *isec, "Trying to read pointer list beyond section end"); const Reloc *reloc = isec->getRelocAt(secOffset); + // Empty list is a valid case, return true. if (!reloc) - return; + return true; auto *ptrListSym = dyn_cast_or_null(reloc->referent.get()); assert(ptrListSym && "Reloc does not have a valid Defined"); @@ -759,17 +769,24 @@ void ObjcCategoryMerger::parsePointerListInfo(const ConcatInputSection *isec, const Reloc *reloc = ptrListSym->isec()->getRelocAt(off); assert(reloc && "No reloc found at pointer list offset"); - auto *listSym = dyn_cast_or_null(reloc->referent.get()); - assert(listSym && "Reloc does not have a valid Defined"); + auto *listSym = + dyn_cast_or_null(reloc->referent.dyn_cast()); + // Sometimes, the reloc points to a StringPiece (InputSection + addend) + // instead of a symbol. + // TODO: Skip these cases for now, but we should fix this. + if (!listSym) + return false; ptrList.allPtrs.push_back(listSym); } + + return true; } // Here we parse all the information of an input category (catInfo) and // append the parsed info into the structure which will contain all the // information about how a class is extended (extInfo) -void ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo, +bool ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo, ClassExtensionInfo &extInfo) { const Reloc *catNameReloc = catInfo.catBodyIsec->getRelocAt(catLayout.nameOffset); @@ -808,20 +825,27 @@ void ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo, "class"); } - parsePointerListInfo(catInfo.catBodyIsec, catLayout.instanceMethodsOffset, - extInfo.instanceMethods); + if (!parsePointerListInfo(catInfo.catBodyIsec, + catLayout.instanceMethodsOffset, + extInfo.instanceMethods)) + return false; - parsePointerListInfo(catInfo.catBodyIsec, catLayout.classMethodsOffset, - extInfo.classMethods); + if (!parsePointerListInfo(catInfo.catBodyIsec, catLayout.classMethodsOffset, + extInfo.classMethods)) + return false; parseProtocolListInfo(catInfo.catBodyIsec, catLayout.protocolsOffset, extInfo.protocols, catInfo.sourceLanguage); - parsePointerListInfo(catInfo.catBodyIsec, catLayout.instancePropsOffset, - extInfo.instanceProps); + if (!parsePointerListInfo(catInfo.catBodyIsec, catLayout.instancePropsOffset, + extInfo.instanceProps)) + return false; - parsePointerListInfo(catInfo.catBodyIsec, catLayout.classPropsOffset, - extInfo.classProps); + if (!parsePointerListInfo(catInfo.catBodyIsec, catLayout.classPropsOffset, + extInfo.classProps)) + return false; + + return true; } // Generate a protocol list (including header) and link it into the parent at @@ -1090,14 +1114,15 @@ Defined *ObjcCategoryMerger::emitCategory(const ClassExtensionInfo &extInfo) { // This method merges all the categories (sharing a base class) into a single // category. 
-void ObjcCategoryMerger::mergeCategoriesIntoSingleCategory( +bool ObjcCategoryMerger::mergeCategoriesIntoSingleCategory( std::vector &categories) { assert(categories.size() > 1 && "Expected at least 2 categories"); ClassExtensionInfo extInfo(catLayout); for (auto &catInfo : categories) - parseCatInfoToExtInfo(catInfo, extInfo); + if (!parseCatInfoToExtInfo(catInfo, extInfo)) + return false; Defined *newCatDef = emitCategory(extInfo); assert(newCatDef && "Failed to create a new category"); @@ -1107,6 +1132,8 @@ void ObjcCategoryMerger::mergeCategoriesIntoSingleCategory( for (auto &catInfo : categories) catInfo.wasMerged = true; + + return true; } void ObjcCategoryMerger::createSymbolReference(Defined *refFrom, @@ -1179,9 +1206,10 @@ void ObjcCategoryMerger::collectAndValidateCategoriesData() { tryGetSymbolAtIsecOffset(catBodyIsec, catLayout.klassOffset); assert(classSym && "Category does not have a valid base class"); - categoryMap[classSym].push_back(catInputInfo); + if (!collectCategoryWriterInfoFromCategory(catInputInfo)) + continue; - collectCategoryWriterInfoFromCategory(catInputInfo); + categoryMap[classSym].push_back(catInputInfo); } } } @@ -1309,13 +1337,17 @@ void ObjcCategoryMerger::doMerge() { collectAndValidateCategoriesData(); for (auto &[baseClass, catInfos] : categoryMap) { + bool merged = false; if (auto *baseClassDef = dyn_cast(baseClass)) { // Merge all categories into the base class - mergeCategoriesIntoBaseClass(baseClassDef, catInfos); + merged = mergeCategoriesIntoBaseClass(baseClassDef, catInfos); } else if (catInfos.size() > 1) { // Merge all categories into a new, single category - mergeCategoriesIntoSingleCategory(catInfos); + merged = mergeCategoriesIntoSingleCategory(catInfos); } + if (!merged) + warn("ObjC category merging skipped for class symbol' " + + baseClass->getName().str() + "'\n"); } // Erase all categories that were merged @@ -1374,7 +1406,8 @@ ObjcCategoryMerger::getClassSymSourceLang(const Defined *classSym) { llvm_unreachable("Unexpected class symbol name during category merging"); } -void ObjcCategoryMerger::mergeCategoriesIntoBaseClass( + +bool ObjcCategoryMerger::mergeCategoriesIntoBaseClass( const Defined *baseClass, std::vector &categories) { assert(categories.size() >= 1 && "Expected at least one category to merge"); @@ -1383,9 +1416,9 @@ void ObjcCategoryMerger::mergeCategoriesIntoBaseClass( extInfo.baseClass = baseClass; extInfo.baseClassSourceLanguage = getClassSymSourceLang(baseClass); - for (auto &catInfo : categories) { - parseCatInfoToExtInfo(catInfo, extInfo); - } + for (auto &catInfo : categories) + if (!parseCatInfoToExtInfo(catInfo, extInfo)) + return false; // Get metadata for the base class Defined *metaRo = getClassRo(baseClass, /*getMetaRo=*/true); @@ -1452,6 +1485,8 @@ void ObjcCategoryMerger::mergeCategoriesIntoBaseClass( // Mark all the categories as merged - this will be used to erase them later for (auto &catInfo : categories) catInfo.wasMerged = true; + + return true; } // Erase the symbol at a given offset in an InputSection diff --git a/lld/test/MachO/objc-category-merging-minimal.s b/lld/test/MachO/objc-category-merging-minimal.s index 088a4d0f3041..0fc785a4a9e4 100644 --- a/lld/test/MachO/objc-category-merging-minimal.s +++ b/lld/test/MachO/objc-category-merging-minimal.s @@ -28,6 +28,19 @@ # RUN: %lld -no_objc_relative_method_lists -arch arm64 -dylib -o merge_base_class_swift_minimal_yes_merge.dylib -objc_category_merging MyBaseClassSwiftExtension.o merge_base_class_minimal.o # RUN: llvm-objdump --objc-meta-data 
--macho merge_base_class_swift_minimal_yes_merge.dylib | FileCheck %s --check-prefixes=YES_MERGE_INTO_BASE_SWIFT +############ Test merging skipped due to invalid category name ############ +# Modify __OBJC_$_CATEGORY_MyBaseClass_$_Category01's name to point to L_OBJC_IMAGE_INFO+3 +# RUN: sed -E '/^__OBJC_\$_CATEGORY_MyBaseClass_\$_Category01:/ { n; s/^[ \t]*\.quad[ \t]+l_OBJC_CLASS_NAME_$/\t.quad\tL_OBJC_IMAGE_INFO+3/ }' merge_cat_minimal.s > merge_cat_minimal_bad_name.s + +# Assemble the modified source +# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_cat_minimal_bad_name.o merge_cat_minimal_bad_name.s + +# Run lld and check for the specific warning +# RUN: %no-fatal-warnings-lld -arch arm64 -dylib -objc_category_merging -o merge_cat_minimal_merge.dylib a64_fakedylib.dylib merge_cat_minimal_bad_name.o 2>&1 | FileCheck %s --check-prefix=MERGE_WARNING + +# Check that lld emitted the warning about skipping category merging +MERGE_WARNING: warning: ObjC category merging skipped for class symbol' _OBJC_CLASS_$_MyBaseClass' + #### Check merge categories enabled ### # Check that the original categories are not there MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01 -- GitLab From 7e87c2ae5d7e8a855746467442a1da9a3c6bf2fa Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 18 Oct 2024 19:05:57 +0100 Subject: [PATCH 090/511] [AArch64] Add some qshrn test cases. NFC --- llvm/test/CodeGen/AArch64/qshrn.ll | 383 +++++++++++++++++++++++++++++ 1 file changed, 383 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/qshrn.ll diff --git a/llvm/test/CodeGen/AArch64/qshrn.ll b/llvm/test/CodeGen/AArch64/qshrn.ll new file mode 100644 index 000000000000..eaba88da7b09 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/qshrn.ll @@ -0,0 +1,383 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc %s -mtriple=aarch64 -o - | FileCheck %s + +define <4 x i16> @NarrowAShrI32By5(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrI32By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.4s, v0.4s, #5 +; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %s = ashr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowAShrU32By5(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrU32By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.4s, v0.4s, #5 +; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %s = ashr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowAShrI32By5ToU16(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrI32By5ToU16: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.4s, v0.4s, #5 +; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: ret + %s = ashr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowLShrI32By5(<4 x i32> %x) { +; CHECK-LABEL: NarrowLShrI32By5: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.4s, v0.4s, #5 +; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %s = lshr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowLShrU32By5(<4 x i32> %x) { +; CHECK-LABEL: NarrowLShrU32By5: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.4s, v0.4s, #5 +; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %s = lshr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x 
i16> @NarrowLShrI32By5ToU16(<4 x i32> %x) { +; CHECK-LABEL: NarrowLShrI32By5ToU16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.4s, v0.4s, #5 +; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: ret + %s = lshr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + + +define <2 x i32> @NarrowAShri64By5(<2 x i64> %x) { +; CHECK-LABEL: NarrowAShri64By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.2d, v0.2d, #5 +; CHECK-NEXT: sqxtn v0.2s, v0.2d +; CHECK-NEXT: ret + %s = ashr <2 x i64> %x, + %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowAShrU64By5(<2 x i64> %x) { +; CHECK-LABEL: NarrowAShrU64By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.2d, v0.2d, #5 +; CHECK-NEXT: uqxtn v0.2s, v0.2d +; CHECK-NEXT: ret + %s = ashr <2 x i64> %x, + %r = tail call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowAShri64By5ToU32(<2 x i64> %x) { +; CHECK-LABEL: NarrowAShri64By5ToU32: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.2d, v0.2d, #5 +; CHECK-NEXT: sqxtun v0.2s, v0.2d +; CHECK-NEXT: ret + %s = ashr <2 x i64> %x, + %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowLShri64By5(<2 x i64> %x) { +; CHECK-LABEL: NarrowLShri64By5: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.2d, v0.2d, #5 +; CHECK-NEXT: sqxtn v0.2s, v0.2d +; CHECK-NEXT: ret + %s = lshr <2 x i64> %x, + %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowLShrU64By5(<2 x i64> %x) { +; CHECK-LABEL: NarrowLShrU64By5: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.2d, v0.2d, #5 +; CHECK-NEXT: uqxtn v0.2s, v0.2d +; CHECK-NEXT: ret + %s = lshr <2 x i64> %x, + %r = tail call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowLShri64By5ToU32(<2 x i64> %x) { +; CHECK-LABEL: NarrowLShri64By5ToU32: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.2d, v0.2d, #5 +; CHECK-NEXT: sqxtun v0.2s, v0.2d +; CHECK-NEXT: ret + %s = lshr <2 x i64> %x, + %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + + +define <8 x i8> @NarrowAShri16By5(<8 x i16> %x) { +; CHECK-LABEL: NarrowAShri16By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.8h, v0.8h, #5 +; CHECK-NEXT: sqxtn v0.8b, v0.8h +; CHECK-NEXT: ret + %s = ashr <8 x i16> %x, + %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowAShrU16By5(<8 x i16> %x) { +; CHECK-LABEL: NarrowAShrU16By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.8h, v0.8h, #5 +; CHECK-NEXT: uqxtn v0.8b, v0.8h +; CHECK-NEXT: ret + %s = ashr <8 x i16> %x, + %r = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowAShri16By5ToU8(<8 x i16> %x) { +; CHECK-LABEL: NarrowAShri16By5ToU8: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.8h, v0.8h, #5 +; CHECK-NEXT: sqxtun v0.8b, v0.8h +; CHECK-NEXT: ret + %s = ashr <8 x i16> %x, + %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowLShri16By5(<8 x i16> %x) { +; CHECK-LABEL: NarrowLShri16By5: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.8h, v0.8h, #5 +; CHECK-NEXT: sqxtn v0.8b, v0.8h +; CHECK-NEXT: ret + %s = lshr <8 x i16> %x, + %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> 
@NarrowLShrU16By5(<8 x i16> %x) { +; CHECK-LABEL: NarrowLShrU16By5: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.8h, v0.8h, #5 +; CHECK-NEXT: uqxtn v0.8b, v0.8h +; CHECK-NEXT: ret + %s = lshr <8 x i16> %x, + %r = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowLShri16By5ToU8(<8 x i16> %x) { +; CHECK-LABEL: NarrowLShri16By5ToU8: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.8h, v0.8h, #5 +; CHECK-NEXT: sqxtun v0.8b, v0.8h +; CHECK-NEXT: ret + %s = lshr <8 x i16> %x, + %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + + + + + +define <4 x i16> @NarrowAShrI32By31(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrI32By31: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.4s, v0.4s, #16 +; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %s = ashr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowAShrI32By31ToU16(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrI32By31ToU16: +; CHECK: // %bb.0: +; CHECK-NEXT: sshr v0.4s, v0.4s, #16 +; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: ret + %s = ashr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowLShrU32By31(<4 x i32> %x) { +; CHECK-LABEL: NarrowLShrU32By31: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 +; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %s = lshr <4 x i32> %x, + %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + + +define <16 x i8> @signed_minmax_v8i16_to_v16i8(<16 x i16> %x) { +; CHECK-LABEL: signed_minmax_v8i16_to_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshr v0.8h, v0.8h, #5 +; CHECK-NEXT: sshr v1.8h, v1.8h, #5 +; CHECK-NEXT: sqxtn v0.8b, v0.8h +; CHECK-NEXT: sqxtn2 v0.16b, v1.8h +; CHECK-NEXT: ret +entry: + %s = ashr <16 x i16> %x, + %min = call <16 x i16> @llvm.smin.v8i16(<16 x i16> %s, <16 x i16> ) + %max = call <16 x i16> @llvm.smax.v8i16(<16 x i16> %min, <16 x i16> ) + %trunc = trunc <16 x i16> %max to <16 x i8> + ret <16 x i8> %trunc +} + +define <16 x i8> @unsigned_minmax_v8i16_to_v16i8(<16 x i16> %x) { +; CHECK-LABEL: unsigned_minmax_v8i16_to_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushr v0.8h, v0.8h, #5 +; CHECK-NEXT: ushr v1.8h, v1.8h, #5 +; CHECK-NEXT: uqxtn v0.8b, v0.8h +; CHECK-NEXT: uqxtn2 v0.16b, v1.8h +; CHECK-NEXT: ret +entry: + %s = lshr <16 x i16> %x, + %min = call <16 x i16> @llvm.umin.v8i16(<16 x i16> %s, <16 x i16> ) + %trunc = trunc <16 x i16> %min to <16 x i8> + ret <16 x i8> %trunc +} + +define <16 x i8> @unsigned_signed_minmax_v8i16_to_v16i8(<16 x i16> %x) { +; CHECK-LABEL: unsigned_signed_minmax_v8i16_to_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshr v0.8h, v0.8h, #5 +; CHECK-NEXT: sshr v1.8h, v1.8h, #5 +; CHECK-NEXT: sqxtun v0.8b, v0.8h +; CHECK-NEXT: sqxtun2 v0.16b, v1.8h +; CHECK-NEXT: ret +entry: + %s = ashr <16 x i16> %x, + %max = call <16 x i16> @llvm.smax.v8i16(<16 x i16> %s, <16 x i16> ) + %min = call <16 x i16> @llvm.umin.v8i16(<16 x i16> %max, <16 x i16> ) + %trunc = trunc <16 x i16> %min to <16 x i8> + ret <16 x i8> %trunc +} + + +define <8 x i16> @signed_minmax_v4i32_to_v8i16(<8 x i32> %x) { +; CHECK-LABEL: signed_minmax_v4i32_to_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshr v0.4s, v0.4s, #5 +; CHECK-NEXT: sshr v1.4s, v1.4s, #5 +; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: sqxtn2 v0.8h, v1.4s +; CHECK-NEXT: ret +entry: + %s = ashr <8 x 
i32> %x, + %min = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %s, <8 x i32> ) + %max = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %min, <8 x i32> ) + %trunc = trunc <8 x i32> %max to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @unsigned_minmax_v4i32_to_v8i16(<8 x i32> %x) { +; CHECK-LABEL: unsigned_minmax_v4i32_to_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushr v0.4s, v0.4s, #5 +; CHECK-NEXT: ushr v1.4s, v1.4s, #5 +; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: uqxtn2 v0.8h, v1.4s +; CHECK-NEXT: ret +entry: + %s = lshr <8 x i32> %x, + %min = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %s, <8 x i32> ) + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @unsigned_signed_minmax_v4i32_to_v8i16(<8 x i32> %x) { +; CHECK-LABEL: unsigned_signed_minmax_v4i32_to_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshr v0.4s, v0.4s, #5 +; CHECK-NEXT: sshr v1.4s, v1.4s, #5 +; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: sqxtun2 v0.8h, v1.4s +; CHECK-NEXT: ret +entry: + %s = ashr <8 x i32> %x, + %max = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %s, <8 x i32> ) + %min = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %max, <8 x i32> ) + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + + +define <4 x i32> @signed_minmax_v4i64_to_v8i32(<4 x i64> %x) { +; CHECK-LABEL: signed_minmax_v4i64_to_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshr v0.2d, v0.2d, #5 +; CHECK-NEXT: sshr v1.2d, v1.2d, #5 +; CHECK-NEXT: sqxtn v0.2s, v0.2d +; CHECK-NEXT: sqxtn2 v0.4s, v1.2d +; CHECK-NEXT: ret +entry: + %s = ashr <4 x i64> %x, + %min = call <4 x i64> @llvm.smin.v8i64(<4 x i64> %s, <4 x i64> ) + %max = call <4 x i64> @llvm.smax.v8i64(<4 x i64> %min, <4 x i64> ) + %trunc = trunc <4 x i64> %max to <4 x i32> + ret <4 x i32> %trunc +} + +define <4 x i32> @unsigned_minmax_v4i64_to_v8i32(<4 x i64> %x) { +; CHECK-LABEL: unsigned_minmax_v4i64_to_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushr v0.2d, v0.2d, #5 +; CHECK-NEXT: ushr v1.2d, v1.2d, #5 +; CHECK-NEXT: uqxtn v0.2s, v0.2d +; CHECK-NEXT: uqxtn2 v0.4s, v1.2d +; CHECK-NEXT: ret +entry: + %s = lshr <4 x i64> %x, + %min = call <4 x i64> @llvm.umin.v8i64(<4 x i64> %s, <4 x i64> ) + %trunc = trunc <4 x i64> %min to <4 x i32> + ret <4 x i32> %trunc +} + +define <4 x i32> @unsigned_signed_minmax_v4i64_to_v8i32(<4 x i64> %x) { +; CHECK-LABEL: unsigned_signed_minmax_v4i64_to_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshr v0.2d, v0.2d, #5 +; CHECK-NEXT: sshr v1.2d, v1.2d, #5 +; CHECK-NEXT: sqxtun v0.2s, v0.2d +; CHECK-NEXT: sqxtun2 v0.4s, v1.2d +; CHECK-NEXT: ret +entry: + %s = ashr <4 x i64> %x, + %max = call <4 x i64> @llvm.smax.v8i64(<4 x i64> %s, <4 x i64> ) + %min = call <4 x i64> @llvm.umin.v8i64(<4 x i64> %max, <4 x i64> ) + %trunc = trunc <4 x i64> %min to <4 x i32> + ret <4 x i32> %trunc +} -- GitLab From 266154a59b957daa7ec976dea70cc75e78ca71b6 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 18 Oct 2024 11:15:02 -0700 Subject: [PATCH 091/511] [ADT] Make concat able to handle ranges with iterators that return by value (such as zip) (#112783) If any iterator in the concatenation returns by value, the result must return by value otherwise it'll produce dangling references. 
(some context that may or may not be relevant to this part of the code may be in https://github.com/llvm/llvm-project/commit/981ce8fa15afa11d083033240edb1daff29081c7 ) An alternative to #112441 --- llvm/include/llvm/ADT/STLExtras.h | 29 ++++++++++++++++------ llvm/unittests/ADT/STLExtrasTest.cpp | 37 ++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index eb441bb31c9b..43c9b80edff7 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1023,6 +1023,16 @@ class concat_iterator std::forward_iterator_tag, ValueT> { using BaseT = typename concat_iterator::iterator_facade_base; + static constexpr bool ReturnsByValue = + !(std::is_reference_v())> && ...); + + using reference_type = + typename std::conditional_t; + + using handle_type = + typename std::conditional_t, + ValueT *>; + /// We store both the current and end iterators for each concatenated /// sequence in a tuple of pairs. /// @@ -1065,27 +1075,30 @@ class concat_iterator /// Returns null if the specified iterator is at the end. Otherwise, /// dereferences the iterator and returns the address of the resulting /// reference. - template ValueT *getHelper() const { + template handle_type getHelper() const { auto &Begin = std::get(Begins); auto &End = std::get(Ends); if (Begin == End) - return nullptr; + return {}; - return &*Begin; + if constexpr (ReturnsByValue) + return *Begin; + else + return &*Begin; } /// Finds the first non-end iterator, dereferences, and returns the resulting /// reference. /// /// It is an error to call this with all iterators at the end. - template ValueT &get(std::index_sequence) const { + template reference_type get(std::index_sequence) const { // Build a sequence of functions to get from iterator if possible. - ValueT *(concat_iterator::*GetHelperFns[])() const = { - &concat_iterator::getHelper...}; + handle_type (concat_iterator::*GetHelperFns[])() + const = {&concat_iterator::getHelper...}; // Loop over them, and return the first result we find. 
for (auto &GetHelperFn : GetHelperFns) - if (ValueT *P = (this->*GetHelperFn)()) + if (auto P = (this->*GetHelperFn)()) return *P; llvm_unreachable("Attempted to get a pointer from an end concat iterator!"); @@ -1107,7 +1120,7 @@ public: return *this; } - ValueT &operator*() const { + reference_type operator*() const { return get(std::index_sequence_for()); } diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp index ee8299c9b486..406ff2bc1607 100644 --- a/llvm/unittests/ADT/STLExtrasTest.cpp +++ b/llvm/unittests/ADT/STLExtrasTest.cpp @@ -504,6 +504,43 @@ TEST(STLExtrasTest, ConcatRange) { EXPECT_EQ(Expected, Test); } +template struct Iterator { + int i = 0; + T operator*() const { return i; } + Iterator &operator++() { + ++i; + return *this; + } + bool operator==(Iterator RHS) const { return i == RHS.i; } +}; + +template struct RangeWithValueType { + int i; + RangeWithValueType(int i) : i(i) {} + Iterator begin() { return Iterator{0}; } + Iterator end() { return Iterator{i}; } +}; + +TEST(STLExtrasTest, ValueReturn) { + RangeWithValueType R(1); + auto C = concat(R, R); + auto I = C.begin(); + ASSERT_NE(I, C.end()); + static_assert(std::is_same_v); + auto V = *I; + ASSERT_EQ(V, 0); +} + +TEST(STLExtrasTest, ReferenceReturn) { + RangeWithValueType R(1); + auto C = concat(R, R); + auto I = C.begin(); + ASSERT_NE(I, C.end()); + static_assert(std::is_same_v); + auto V = *I; + ASSERT_EQ(V, 0); +} + TEST(STLExtrasTest, PartitionAdaptor) { std::vector V = {1, 2, 3, 4, 5, 6, 7, 8}; -- GitLab From 629a182282c5e3dad31e8af5f651f91a4fff1b6a Mon Sep 17 00:00:00 2001 From: HighW4y2H3ll Date: Fri, 18 Oct 2024 11:16:57 -0700 Subject: [PATCH 092/511] Full path names are used in several unittests instead of the binary name. Fix up the testcase failures (#107974) Encountered several testcase failures when running `ninja check-all`. It was due to the full path name were shown in the error message instead of the binary name, and therefore causing the check string mismatch. The machine was running CentOS 9 with binfmt_misc setup that uses qemu-aarch64 (8.1.2). Built and ran the unittest as aarch64 host (through qemu user). Co-authored-by: h2h --- clang-tools-extra/clangd/test/log.test | 2 +- clang-tools-extra/test/clang-query/invalid-command-line.cpp | 2 +- .../test/clang-tidy/infrastructure/invalid-command-line.cpp | 2 +- llvm/utils/lit/tests/shtest-output-printing.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clangd/test/log.test b/clang-tools-extra/clangd/test/log.test index 7a53d361ddde..5cc871972f98 100644 --- a/clang-tools-extra/clangd/test/log.test +++ b/clang-tools-extra/clangd/test/log.test @@ -1,7 +1,7 @@ # RUN: env CLANGD_FLAGS=-compile-commands-dir=no-such-dir not clangd -lit-test &1 >/dev/null | FileCheck %s CHECK: I[{{.*}}]{{.*}} clangd version {{.*}} CHECK: Working directory: {{.*}} -CHECK: argv[0]: clangd +CHECK: argv[0]: {{.*}}clangd CHECK: argv[1]: -lit-test CHECK: CLANGD_FLAGS: -compile-commands-dir=no-such-dir CHECK: E[{{.*}}] Path specified by --compile-commands-dir does not exist. 
diff --git a/clang-tools-extra/test/clang-query/invalid-command-line.cpp b/clang-tools-extra/test/clang-query/invalid-command-line.cpp index e3e8af1d5e7a..a66acc8037f7 100644 --- a/clang-tools-extra/test/clang-query/invalid-command-line.cpp +++ b/clang-tools-extra/test/clang-query/invalid-command-line.cpp @@ -1,4 +1,4 @@ // RUN: not clang-query --invalid-arg 2>&1 | FileCheck %s -// CHECK: error: clang-query{{(\.exe)?}}: Unknown command line argument '--invalid-arg'. Try: 'clang-query{{(\.exe)?}} --help' +// CHECK: error: clang-query{{(\.exe)?}}: Unknown command line argument '--invalid-arg'. Try: '{{.*}}clang-query{{(\.exe)?}} --help' // CHECK-NEXT: clang-query{{(\.exe)?}}: Did you mean '--extra-arg'? diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp index c06b09d90004..4bdca50af32c 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp @@ -1,4 +1,4 @@ // RUN: not clang-tidy --invalid-arg 2>&1 | FileCheck %s -// CHECK: error: clang-tidy{{(\.exe)?}}: Unknown command line argument '--invalid-arg'. Try: 'clang-tidy{{(\.exe)?}} --help' +// CHECK: error: clang-tidy{{(\.exe)?}}: Unknown command line argument '--invalid-arg'. Try: '{{.*}}clang-tidy{{(\.exe)?}} --help' // CHECK-NEXT: clang-tidy{{(\.exe)?}}: Did you mean '--extra-arg'? diff --git a/llvm/utils/lit/tests/shtest-output-printing.py b/llvm/utils/lit/tests/shtest-output-printing.py index 129cff981eb5..b9045c3fe520 100644 --- a/llvm/utils/lit/tests/shtest-output-printing.py +++ b/llvm/utils/lit/tests/shtest-output-printing.py @@ -25,7 +25,7 @@ # CHECK-NEXT: not not wc missing-file &> [[FILE:.*]] || true # CHECK-NEXT: # executed command: not not wc missing-file # CHECK-NEXT: # .---redirected output from '[[FILE]]' -# CHECK-NEXT: # | wc: {{cannot open missing-file|missing-file.* No such file or directory}} +# CHECK-NEXT: # | {{.*}}wc: {{cannot open missing-file|missing-file.* No such file or directory}} # CHECK-NEXT: # `----------------------------- # CHECK-NEXT: # note: command had no output on stdout or stderr # CHECK-NEXT: # error: command failed with exit status: 1 -- GitLab From 170dab9972df3f6e905502db1846bb05fb444ec4 Mon Sep 17 00:00:00 2001 From: lntue Date: Fri, 18 Oct 2024 14:26:05 -0400 Subject: [PATCH 093/511] [libc][math] Fix signed zeros for powf when underflow happens. 
(#112601) --- libc/src/math/generic/powf.cpp | 6 +++--- libc/test/src/math/smoke/powf_test.cpp | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index 8ce2465ba229..83477c6ef2ac 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -855,9 +855,9 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { : 0.0; exp2_hi_mid_dd.hi = exp2_hi_mid; - return static_cast( - powf_double_double(idx_x, dx, y6, lo6_hi, exp2_hi_mid_dd)) + - 0.0f; + double r_dd = powf_double_double(idx_x, dx, y6, lo6_hi, exp2_hi_mid_dd); + + return static_cast(r_dd); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/smoke/powf_test.cpp b/libc/test/src/math/smoke/powf_test.cpp index bd4f98e30fbd..a0f66f2733a1 100644 --- a/libc/test/src/math/smoke/powf_test.cpp +++ b/libc/test/src/math/smoke/powf_test.cpp @@ -190,4 +190,7 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { FE_UNDERFLOW); } } + + EXPECT_FP_EQ(-0.0f, LIBC_NAMESPACE::powf(-0.015625f, 25.0f)); + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::powf(-0.015625f, 26.0f)); } -- GitLab From e3b22dcedb53386d7ed4db0e013365ebfe67571c Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Fri, 18 Oct 2024 11:36:19 -0700 Subject: [PATCH 094/511] [clang][RISCV] Extend intrinsic size check variable from 16 -> 32 bits. NFC (#111481) We currently have over 67000 intrinsics, uint16_t will overflow. --- clang/lib/Sema/SemaRISCV.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index d1ccc2774152..e63d605349e0 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -50,7 +50,7 @@ struct RVVIntrinsicDef { struct RVVOverloadIntrinsicDef { // Indexes of RISCVIntrinsicManagerImpl::IntrinsicList. - SmallVector Indexes; + SmallVector Indexes; }; } // namespace @@ -169,7 +169,7 @@ private: // List of all RVV intrinsic. std::vector IntrinsicList; // Mapping function name to index of IntrinsicList. - StringMap Intrinsics; + StringMap Intrinsics; // Mapping function name to RVVOverloadIntrinsicDef. StringMap OverloadIntrinsics; @@ -399,7 +399,7 @@ void RISCVIntrinsicManagerImpl::InitRVVIntrinsic( Record.HasFRMRoundModeOp); // Put into IntrinsicList. - uint16_t Index = IntrinsicList.size(); + uint32_t Index = IntrinsicList.size(); assert(IntrinsicList.size() == (size_t)Index && "Intrinsics indices overflow."); IntrinsicList.push_back({BuiltinName, Signature}); -- GitLab From d60fdc1ca31f21e27450f3902710ab37907af84e Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 18 Oct 2024 11:42:13 -0700 Subject: [PATCH 095/511] [nfc][lsan] Parametrize ScanForPointers with loader (#112803) Use `DirectLoader` which is equivalent to existing behaviour of loading pointers directly from memory. --- compiler-rt/lib/lsan/lsan_common.cpp | 70 +++++++++++++++++++--------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 721db7872cce..9aed36b96ce9 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -288,23 +288,33 @@ static inline bool MaybeUserPointer(uptr p) { # endif } +namespace { +struct DirectMemoryAccessor { + void Init(uptr begin, uptr end) {}; + void *LoadPtr(uptr p) const { return *reinterpret_cast(p); } +}; +} // namespace + // Scans the memory range, looking for byte patterns that point into allocator // chunks. 
Marks those chunks with |tag| and adds them to |frontier|. // There are two usage modes for this function: finding reachable chunks // (|tag| = kReachable) and finding indirectly leaked chunks // (|tag| = kIndirectlyLeaked). In the second case, there's no flood fill, // so |frontier| = 0. -void ScanRangeForPointers(uptr begin, uptr end, Frontier *frontier, - const char *region_type, ChunkTag tag) { +template +void ScanForPointers(uptr begin, uptr end, Frontier *frontier, + const char *region_type, ChunkTag tag, + Accessor &accessor) { CHECK(tag == kReachable || tag == kIndirectlyLeaked); const uptr alignment = flags()->pointer_alignment(); LOG_POINTERS("Scanning %s range %p-%p.\n", region_type, (void *)begin, (void *)end); + accessor.Init(begin, end); uptr pp = begin; if (pp % alignment) pp = pp + alignment - pp % alignment; for (; pp + sizeof(void *) <= end; pp += alignment) { - void *p = *reinterpret_cast(pp); + void *p = accessor.LoadPtr(pp); # if SANITIZER_APPLE p = TransformPointer(p); # endif @@ -339,6 +349,12 @@ void ScanRangeForPointers(uptr begin, uptr end, Frontier *frontier, } } +void ScanRangeForPointers(uptr begin, uptr end, Frontier *frontier, + const char *region_type, ChunkTag tag) { + DirectMemoryAccessor accessor; + ScanForPointers(begin, end, frontier, region_type, tag, accessor); +} + // Scans a global range for pointers void ScanGlobalRange(uptr begin, uptr end, Frontier *frontier) { uptr allocator_begin = 0, allocator_end = 0; @@ -356,14 +372,21 @@ void ScanGlobalRange(uptr begin, uptr end, Frontier *frontier) { } } -void ScanExtraStackRanges(const InternalMmapVector &ranges, - Frontier *frontier) { +template +void ScanExtraStack(const InternalMmapVector &ranges, Frontier *frontier, + Accessor &accessor) { for (uptr i = 0; i < ranges.size(); i++) { - ScanRangeForPointers(ranges[i].begin, ranges[i].end, frontier, "FAKE STACK", - kReachable); + ScanForPointers(ranges[i].begin, ranges[i].end, frontier, "FAKE STACK", + kReachable, accessor); } } +void ScanExtraStackRanges(const InternalMmapVector &ranges, + Frontier *frontier) { + DirectMemoryAccessor accessor; + ScanExtraStack(ranges, frontier, accessor); +} + # if SANITIZER_FUCHSIA // Fuchsia handles all threads together with its own callback. @@ -399,10 +422,11 @@ static void ProcessThreadRegistry(Frontier *frontier) { } // Scans thread data (stacks and TLS) for heap pointers. +template static void ProcessThread(tid_t os_id, uptr sp, const InternalMmapVector ®isters, InternalMmapVector &extra_ranges, - Frontier *frontier) { + Frontier *frontier, Accessor &accessor) { // `extra_ranges` is outside of the function and the loop to reused mapped // memory. CHECK(extra_ranges.empty()); @@ -426,8 +450,8 @@ static void ProcessThread(tid_t os_id, uptr sp, uptr registers_begin = reinterpret_cast(registers.data()); uptr registers_end = reinterpret_cast(registers.data() + registers.size()); - ScanRangeForPointers(registers_begin, registers_end, frontier, "REGISTERS", - kReachable); + ScanForPointers(registers_begin, registers_end, frontier, "REGISTERS", + kReachable, accessor); } if (flags()->use_stacks) { @@ -451,9 +475,10 @@ static void ProcessThread(tid_t os_id, uptr sp, // Shrink the stack range to ignore out-of-scope values. 
stack_begin = sp; } - ScanRangeForPointers(stack_begin, stack_end, frontier, "STACK", kReachable); + ScanForPointers(stack_begin, stack_end, frontier, "STACK", kReachable, + accessor); GetThreadExtraStackRangesLocked(os_id, &extra_ranges); - ScanExtraStackRanges(extra_ranges, frontier); + ScanExtraStack(extra_ranges, frontier, accessor); } if (flags()->use_tls) { @@ -463,21 +488,23 @@ static void ProcessThread(tid_t os_id, uptr sp, // otherwise, only scan the non-overlapping portions if (cache_begin == cache_end || tls_end < cache_begin || tls_begin > cache_end) { - ScanRangeForPointers(tls_begin, tls_end, frontier, "TLS", kReachable); + ScanForPointers(tls_begin, tls_end, frontier, "TLS", kReachable, + accessor); } else { if (tls_begin < cache_begin) - ScanRangeForPointers(tls_begin, cache_begin, frontier, "TLS", - kReachable); + ScanForPointers(tls_begin, cache_begin, frontier, "TLS", kReachable, + accessor); if (tls_end > cache_end) - ScanRangeForPointers(cache_end, tls_end, frontier, "TLS", kReachable); + ScanForPointers(cache_end, tls_end, frontier, "TLS", kReachable, + accessor); } } # if SANITIZER_ANDROID auto *cb = +[](void *dtls_begin, void *dtls_end, uptr /*dso_idd*/, void *arg) -> void { - ScanRangeForPointers( + ScanForPointers( reinterpret_cast(dtls_begin), reinterpret_cast(dtls_end), - reinterpret_cast(arg), "DTLS", kReachable); + reinterpret_cast(arg), "DTLS", kReachable, accessor); }; // FIXME: There might be a race-condition here (and in Bionic) if the @@ -492,8 +519,8 @@ static void ProcessThread(tid_t os_id, uptr sp, if (dtls_beg < dtls_end) { LOG_THREADS("DTLS %d at %p-%p.\n", id, (void *)dtls_beg, (void *)dtls_end); - ScanRangeForPointers(dtls_beg, dtls_end, frontier, "DTLS", - kReachable); + ScanForPointers(dtls_beg, dtls_end, frontier, "DTLS", kReachable, + accessor); } }); } else { @@ -530,7 +557,8 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, if (os_id == caller_tid) sp = caller_sp; - ProcessThread(os_id, sp, registers, extra_ranges, frontier); + DirectMemoryAccessor accessor; + ProcessThread(os_id, sp, registers, extra_ranges, frontier, accessor); } // Add pointers reachable from ThreadContexts -- GitLab From caa9e41814bf069dff2af015e2a710b559294e56 Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:51:21 -0700 Subject: [PATCH 096/511] [lld-macho] Fix category merging sed issue (#112955) Fix 'sed' spacing to ensure compatibility with all platforms. 
Original failure: https://lab.llvm.org/buildbot/#/builders/190/builds/7903 ``` RUN: at line 33: sed -E '/^__OBJC_\$_CATEGORY_MyBaseClass_\$_Category01:/ { n; s/^[ \t]*\.quad[ \t]+l_OBJC_CLASS_NAME_$/\t.quad\tL_OBJC_IMAGE_INFO+3/ }' merge_cat_minimal.s > merge_cat_minimal_bad_name.s + sed -E '/^__OBJC_\$_CATEGORY_MyBaseClass_\$_Category01:/ { n; s/^[ \t]*\.quad[ \t]+l_OBJC_CLASS_NAME_$/\t.quad\tL_OBJC_IMAGE_INFO+3/ }' merge_cat_minimal.s sed: 1: "/^__OBJC_\$_CATEGORY_My ...": bad flag in substitute command: '}' ``` --- lld/test/MachO/objc-category-merging-minimal.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/MachO/objc-category-merging-minimal.s b/lld/test/MachO/objc-category-merging-minimal.s index 0fc785a4a9e4..437294791bf3 100644 --- a/lld/test/MachO/objc-category-merging-minimal.s +++ b/lld/test/MachO/objc-category-merging-minimal.s @@ -30,7 +30,7 @@ ############ Test merging skipped due to invalid category name ############ # Modify __OBJC_$_CATEGORY_MyBaseClass_$_Category01's name to point to L_OBJC_IMAGE_INFO+3 -# RUN: sed -E '/^__OBJC_\$_CATEGORY_MyBaseClass_\$_Category01:/ { n; s/^[ \t]*\.quad[ \t]+l_OBJC_CLASS_NAME_$/\t.quad\tL_OBJC_IMAGE_INFO+3/ }' merge_cat_minimal.s > merge_cat_minimal_bad_name.s +# RUN: sed -E '/^__OBJC_\$_CATEGORY_MyBaseClass_\$_Category01:/ { n; s/^[ \t]*\.quad[ \t]+l_OBJC_CLASS_NAME_$/\t.quad\tL_OBJC_IMAGE_INFO+3/}' merge_cat_minimal.s > merge_cat_minimal_bad_name.s # Assemble the modified source # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_cat_minimal_bad_name.o merge_cat_minimal_bad_name.s -- GitLab From 952dafb08ed2e97c647e925bf713eddb8dc07163 Mon Sep 17 00:00:00 2001 From: lntue Date: Fri, 18 Oct 2024 14:56:23 -0400 Subject: [PATCH 097/511] [libc][math] Add test and fix atan2f crashing when flush-denorm-to-zero (FTZ) and denorm-as-zero (DAZ) modes are set. (#112828) --- libc/src/math/generic/atan2f.cpp | 14 +++++---- libc/test/UnitTest/FPMatcher.h | 26 +++++++++++++++++ libc/test/src/math/smoke/atan2f_test.cpp | 37 ++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 5 deletions(-) diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp index e4b297c00f01..a2e5499809a3 100644 --- a/libc/src/math/generic/atan2f.cpp +++ b/libc/src/math/generic/atan2f.cpp @@ -246,12 +246,18 @@ LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) { uint32_t y_abs = y_bits.uintval(); uint32_t max_abs = x_abs > y_abs ? x_abs : y_abs; uint32_t min_abs = x_abs <= y_abs ? x_abs : y_abs; + float num_f = FPBits(min_abs).get_val(); + float den_f = FPBits(max_abs).get_val(); + double num_d = static_cast(num_f); + double den_d = static_cast(den_f); - if (LIBC_UNLIKELY(max_abs >= 0x7f80'0000U || min_abs == 0U)) { + if (LIBC_UNLIKELY(max_abs >= 0x7f80'0000U || num_d == 0.0)) { if (x_bits.is_nan() || y_bits.is_nan()) return FPBits::quiet_nan().get_val(); - size_t x_except = x_abs == 0 ? 0 : (x_abs == 0x7f80'0000 ? 2 : 1); - size_t y_except = y_abs == 0 ? 0 : (y_abs == 0x7f80'0000 ? 2 : 1); + double x_d = static_cast(x); + double y_d = static_cast(y); + size_t x_except = (x_d == 0.0) ? 0 : (x_abs == 0x7f80'0000 ? 2 : 1); + size_t y_except = (y_d == 0.0) ? 0 : (y_abs == 0x7f80'0000 ? 
2 : 1); // Exceptional cases: // EXCEPT[y_except][x_except][x_is_neg] @@ -275,8 +281,6 @@ LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) { bool recip = x_abs < y_abs; double final_sign = IS_NEG[(x_sign != y_sign) != recip]; fputil::DoubleDouble const_term = CONST_ADJ[x_sign][y_sign][recip]; - double num_d = static_cast(FPBits(min_abs).get_val()); - double den_d = static_cast(FPBits(max_abs).get_val()); double q_d = num_d / den_d; double k_d = fputil::nearest_integer(q_d * 0x1.0p4f); diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index 07e2cd5df18c..e1a33ea326ec 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -16,6 +16,7 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/fpbits_str.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/RoundingModeUtils.h" #include "test/UnitTest/StringUtils.h" #include "test/UnitTest/Test.h" @@ -192,6 +193,31 @@ template struct FPTest : public Test { }; }; +// Add facility to test Flush-Denormal-To-Zero (FTZ) and Denormal-As-Zero (DAZ) +// modes. +// These tests to ensure that our implementations will not crash under these +// modes. +#if defined(LIBC_TARGET_ARCH_IS_X86_64) && __has_builtin(__builtin_ia32_stmxcsr) + +#define LIBC_TEST_FTZ_DAZ + +static constexpr unsigned FTZ = 0x8000; // Flush denormal to zero +static constexpr unsigned DAZ = 0x0040; // Denormal as zero + +struct ModifyMXCSR { + ModifyMXCSR(unsigned flags) { + old_mxcsr = __builtin_ia32_stmxcsr(); + __builtin_ia32_ldmxcsr(old_mxcsr | flags); + } + + ~ModifyMXCSR() { __builtin_ia32_ldmxcsr(old_mxcsr); } + +private: + unsigned old_mxcsr; +}; + +#endif + } // namespace testing } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/smoke/atan2f_test.cpp b/libc/test/src/math/smoke/atan2f_test.cpp index 32a28cfdfeaa..94ec18d8f6b1 100644 --- a/libc/test/src/math/smoke/atan2f_test.cpp +++ b/libc/test/src/math/smoke/atan2f_test.cpp @@ -58,3 +58,40 @@ TEST_F(LlvmLibcAtan2fTest, SpecialNumbers) { // EXPECT_FP_EXCEPTION(0); EXPECT_MATH_ERRNO(0); } + +#ifdef LIBC_TEST_FTZ_DAZ + +using namespace LIBC_NAMESPACE::testing; + +TEST_F(LlvmLibcAtan2fTest, FTZMode) { + ModifyMXCSR mxcsr(FTZ); + + EXPECT_FP_EQ(0x1.921fb6p-1f, + LIBC_NAMESPACE::atan2f(min_denormal, min_denormal)); + EXPECT_FP_EQ(0x1.000002p-23f, + LIBC_NAMESPACE::atan2f(min_denormal, max_denormal)); + EXPECT_FP_EQ(0x1.921fb4p0f, + LIBC_NAMESPACE::atan2f(max_denormal, min_denormal)); + EXPECT_FP_EQ(0x1.921fb6p-1f, + LIBC_NAMESPACE::atan2f(max_denormal, max_denormal)); +} + +TEST_F(LlvmLibcAtan2fTest, DAZMode) { + ModifyMXCSR mxcsr(DAZ); + + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::atan2f(min_denormal, min_denormal)); + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::atan2f(min_denormal, max_denormal)); + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::atan2f(max_denormal, min_denormal)); + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::atan2f(max_denormal, max_denormal)); +} + +TEST_F(LlvmLibcAtan2fTest, FTZDAZMode) { + ModifyMXCSR mxcsr(FTZ | DAZ); + + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::atan2f(min_denormal, min_denormal)); + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::atan2f(min_denormal, max_denormal)); + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::atan2f(max_denormal, min_denormal)); + EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::atan2f(max_denormal, max_denormal)); +} + +#endif -- GitLab From 1ae24460d21577858d034fd4f77f2a986ac062a9 Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Fri, 18 Oct 2024 
12:21:01 -0700 Subject: [PATCH 098/511] [mlir] Add forall canonicalization to replace constant induction vars (#112764) Adds a canonicalization pattern for scf.forall that replaces constant induction variables with a constant index. There is a similar canonicalization that completely removes constant induction variables from the loop, but that pattern does not apply on foralls with mappings, so this one is necessary for those cases. --------- Signed-off-by: Max Dawkins --- mlir/lib/Dialect/SCF/IR/SCF.cpp | 28 ++++++++++++++++++++++++- mlir/test/Dialect/SCF/canonicalize.mlir | 4 +++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 2582d4e0df19..6678878215c1 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -1767,6 +1767,31 @@ struct ForallOpSingleOrZeroIterationDimsFolder } }; +/// Replace all induction vars with a single trip count with their lower bound. +struct ForallOpReplaceConstantInductionVar : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ForallOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + bool changed = false; + for (auto [lb, ub, step, iv] : + llvm::zip(op.getMixedLowerBound(), op.getMixedUpperBound(), + op.getMixedStep(), op.getInductionVars())) { + if (iv.getUses().begin() == iv.getUses().end()) + continue; + auto numIterations = constantTripCount(lb, ub, step); + if (!numIterations.has_value() || numIterations.value() != 1) { + continue; + } + rewriter.replaceAllUsesWith( + iv, getValueOrCreateConstantIndexOp(rewriter, loc, lb)); + changed = true; + } + return success(changed); + } +}; + struct FoldTensorCastOfOutputIntoForallOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -1851,7 +1876,8 @@ void ForallOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { results.add(context); + ForallOpSingleOrZeroIterationDimsFolder, + ForallOpReplaceConstantInductionVar>(context); } /// Given the region at `index`, or the parent operation if `index` is None, diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index c68369a8e4fc..8c4e7a41ee6b 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -1617,7 +1617,7 @@ func.func @do_not_inline_distributed_forall_loop( %in: tensor<8x8xf32>) -> tensor<8x8xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<8x8xf32> - %1 = scf.forall (%i, %j) = (0, 0) to (1, 1) step (8, 8) + %1 = scf.forall (%i, %j) = (0, 4) to (1, 5) step (8, 8) shared_outs (%out_ = %0) -> (tensor<8x8xf32>) { %slice = tensor.extract_slice %out_[%i, %j] [2, 3] [1, 1] : tensor<8x8xf32> to tensor<2x3xf32> @@ -1632,6 +1632,8 @@ func.func @do_not_inline_distributed_forall_loop( } // CHECK-LABEL: @do_not_inline_distributed_forall_loop // CHECK: scf.forall +// CHECK: tensor.extract_slice %{{.*}}[0, 4] [2, 3] [1, 1] +// CHECK: tensor.parallel_insert_slice %{{.*}}[0, 4] [2, 3] [1, 1] // ----- -- GitLab From 53e85d44ad6f0973185fbe5d8d347905a1bdff1c Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 18 Oct 2024 12:35:59 -0700 Subject: [PATCH 099/511] [nfc][ubsan] Reorder RUNs and preconditions in test --- compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp 
b/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp index 2d65330ef289..e1a1554050eb 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp @@ -1,8 +1,10 @@ +// RUN: %clangxx -fsanitize=bool -static %s -o %t && env UBSAN_OPTIONS=handle_segv=0:handle_sigbus=0:handle_sigfpe=0 %run %t 2>&1 | FileCheck %s +// RUN: %run %t 2>&1 | FileCheck %s + // REQUIRES: ubsan-standalone // REQUIRES: target={{x86_64.*}} // UNSUPPORTED: i386-target-arch, internal_symbolizer -// RUN: %clangxx -fsanitize=bool -static %s -o %t && env UBSAN_OPTIONS=handle_segv=0:handle_sigbus=0:handle_sigfpe=0 %run %t 2>&1 | FileCheck %s -// RUN: %run %t 2>&1 | FileCheck %s + #include #include -- GitLab From 203b972289629bb506ef1f890458e8eff638e945 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 18 Oct 2024 12:38:07 -0700 Subject: [PATCH 100/511] [ubsan] Disable test on Darwin Broken after #111497 --- compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp index e1a1554050eb..a716072ef483 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp @@ -5,6 +5,9 @@ // REQUIRES: target={{x86_64.*}} // UNSUPPORTED: i386-target-arch, internal_symbolizer +// Does not link. +// UNSUPPORTED: Darwin + #include #include -- GitLab From 9a4661cf31ea41143ee1c5a926a75320f91b1783 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 18 Oct 2024 12:39:24 -0700 Subject: [PATCH 101/511] [nfc][ubsan] Fix case of UNSUPPORTED: --- compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp index a716072ef483..081eec049e3f 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp @@ -6,7 +6,7 @@ // UNSUPPORTED: i386-target-arch, internal_symbolizer // Does not link. -// UNSUPPORTED: Darwin +// UNSUPPORTED: darwin #include #include -- GitLab From 7f2e937469a8cec3fe977bf41ad2dfb9b4ce648a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Oct 2024 11:45:16 -0700 Subject: [PATCH 102/511] [SLP]Initial non-power-of-2 support (but still whole register) for reductions Enables initial non-power-of-2 support (but still requires number of elements, forming whole registers) for reductions. 
Enables extra vectorization for MultiSource/Benchmarks/7zip/7zip-benchmark, CINT2006/464.h264ref and CFP2017rate/526.blender_r (checked for SSE2) Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/112361 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 495 +++++++++++------- .../PhaseOrdering/AArch64/slpordering.ll | 42 +- .../SLPVectorizer/AArch64/loadorder.ll | 34 +- .../SLPVectorizer/AArch64/tsc-s116.ll | 11 +- .../SLPVectorizer/AArch64/vec3-calls.ll | 3 +- .../X86/gather-node-same-as-vect-but-order.ll | 15 +- .../SLPVectorizer/X86/horizontal-list.ll | 32 +- .../SLPVectorizer/X86/horizontal-minmax.ll | 10 +- .../X86/non-power-of-2-order-detection.ll | 9 +- .../X86/reorder_with_external_users.ll | 191 ------- .../SLPVectorizer/X86/vec3-calls.ll | 3 +- .../X86/vect-gather-same-nodes.ll | 14 +- 12 files changed, 391 insertions(+), 468 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ba70ab1e5e14..53632efe913e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -291,6 +291,8 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, if (NumParts == 0 || NumParts >= Sz) return bit_floor(Sz); unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts)); + if (RegVF > Sz) + return bit_floor(Sz); return (Sz / RegVF) * RegVF; } @@ -1505,6 +1507,12 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const; + /// Checks if the graph and all its subgraphs cannot be better vectorized. + /// It may happen, if all gather nodes are loads and they cannot be + /// "clusterized". In this case even subgraphs cannot be vectorized more + /// effectively than the base graph. + bool isTreeNotExtendable() const; + /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values /// can be load combined in the backend. Load combining may not be allowed in /// the IR optimizer, so we do not want to alter the pattern. For example, @@ -3047,7 +3055,9 @@ private: /// vector loads/masked gathers instead of regular gathers. Later these loads /// are reshufled to build final gathered nodes. void tryToVectorizeGatheredLoads( - ArrayRef>> GatheredLoads); + const SmallMapVector, + SmallVector>>, + 8> &GatheredLoads); /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. @@ -3059,7 +3069,7 @@ private: /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. It returns the map from the store /// pointers to the collected stores. - DenseMap> + SmallVector> collectUserStores(const BoUpSLP::TreeEntry *TE) const; /// Helper for `findExternalStoreUsersReorderIndices()`. 
It checks if the @@ -4657,7 +4667,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes = true) { - if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2)) + if (getUnderlyingObject(Ptr1, RecursionMaxDepth) != + getUnderlyingObject(Ptr2, RecursionMaxDepth)) return false; auto *GEP1 = dyn_cast(Ptr1); auto *GEP2 = dyn_cast(Ptr2); @@ -5177,30 +5188,40 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, return LoadsState::Gather; } -static bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy, +static bool clusterSortPtrAccesses(ArrayRef VL, + ArrayRef BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl &SortedIndices) { - assert(llvm::all_of( - VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && - "Expected list of pointer operands."); + assert( + all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && + "Expected list of pointer operands."); // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each // Ptr into, sort and return the sorted indices with values next to one // another. - MapVector>> Bases; - Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U)); - - unsigned Cnt = 1; - for (Value *Ptr : VL.drop_front()) { - bool Found = any_of(Bases, [&](auto &Base) { - std::optional Diff = - getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE, - /*StrictCheck=*/true); - if (!Diff) - return false; + SmallMapVector, + SmallVector>>, 8> + Bases; + Bases + .try_emplace(std::make_pair( + BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth))) + .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U); - Base.second.emplace_back(Ptr, *Diff, Cnt++); - return true; - }); + SortedIndices.clear(); + for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) { + auto Key = std::make_pair(BBs[Cnt + 1], + getUnderlyingObject(Ptr, RecursionMaxDepth)); + bool Found = any_of(Bases.try_emplace(Key).first->second, + [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) { + std::optional Diff = getPointersDiff( + ElemTy, std::get<0>(Base.front()), ElemTy, + Ptr, DL, SE, + /*StrictCheck=*/true); + if (!Diff) + return false; + + Base.emplace_back(Ptr, *Diff, Cnt + 1); + return true; + }); if (!Found) { // If we haven't found enough to usefully cluster, return early. @@ -5208,71 +5229,39 @@ static bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy, return false; // Not found already - add a new Base - Bases[Ptr].emplace_back(Ptr, 0, Cnt++); + Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1); } } // For each of the bases sort the pointers by Offset and check if any of the // base become consecutively allocated. 
- bool AnyConsecutive = false; for (auto &Base : Bases) { - auto &Vec = Base.second; - if (Vec.size() > 1) { - llvm::stable_sort(Vec, [](const std::tuple &X, - const std::tuple &Y) { - return std::get<1>(X) < std::get<1>(Y); - }); - int InitialOffset = std::get<1>(Vec[0]); - AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) { - return std::get<1>(P.value()) == int(P.index()) + InitialOffset; - }); + for (auto &Vec : Base.second) { + if (Vec.size() > 1) { + stable_sort(Vec, [](const std::tuple &X, + const std::tuple &Y) { + return std::get<1>(X) < std::get<1>(Y); + }); + int InitialOffset = std::get<1>(Vec[0]); + bool AnyConsecutive = + all_of(enumerate(Vec), [InitialOffset](const auto &P) { + return std::get<1>(P.value()) == int(P.index()) + InitialOffset; + }); + // Fill SortedIndices array only if it looks worth-while to sort the + // ptrs. + if (!AnyConsecutive) + return false; + } } - } - - // Fill SortedIndices array only if it looks worth-while to sort the ptrs. - SortedIndices.clear(); - if (!AnyConsecutive) - return false; - - // If we have a better order, also sort the base pointers by increasing - // (variable) values if possible, to try and keep the order more regular. In - // order to create a valid strict-weak order we cluster by the Root of gep - // chains and sort within each. - SmallVector> SortedBases; - for (auto &Base : Bases) { - Value *Strip = Base.first->stripInBoundsConstantOffsets(); - Value *Root = Strip; - while (auto *Gep = dyn_cast(Root)) - Root = Gep->getOperand(0); - SortedBases.emplace_back(Base.first, Strip, Root); - } - auto *Begin = SortedBases.begin(); - auto *End = SortedBases.end(); - while (Begin != End) { - Value *Root = std::get<2>(*Begin); - auto *Mid = std::stable_partition( - Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; }); - DenseMap> LessThan; - for (auto *I = Begin; I < Mid; ++I) - LessThan.try_emplace(std::get<1>(*I)); - for (auto *I = Begin; I < Mid; ++I) { - Value *V = std::get<1>(*I); - while (auto *Gep = dyn_cast(V)) { - V = Gep->getOperand(0); - if (LessThan.contains(V)) - LessThan[V][std::get<1>(*I)] = true; - } - } - std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) { - return LessThan[std::get<1>(V1)][std::get<1>(V2)]; + sort(Base.second, [](const auto &V1, const auto &V2) { + return std::get<2>(V1.front()) < std::get<2>(V2.front()); }); - Begin = Mid; } - // Collect the final order of sorted indices - for (auto Base : SortedBases) - for (auto &T : Bases[std::get<0>(Base)]) - SortedIndices.push_back(std::get<2>(T)); + for (auto &T : Bases) + for (const auto &Vec : T.second) + for (const auto &P : Vec) + SortedIndices.push_back(std::get<2>(P)); assert(SortedIndices.size() == VL.size() && "Expected SortedIndices to be the size of VL"); @@ -5286,15 +5275,18 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { SmallVector Ptrs; Ptrs.reserve(TE.Scalars.size()); + SmallVector BBs; + BBs.reserve(TE.Scalars.size()); for (Value *V : TE.Scalars) { auto *L = dyn_cast(V); if (!L || !L->isSimple()) return std::nullopt; Ptrs.push_back(L->getPointerOperand()); + BBs.push_back(L->getParent()); } BoUpSLP::OrdersType Order; - if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order)) + if (clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order)) return std::move(Order); return std::nullopt; } @@ -5662,7 +5654,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars // has been auditted for 
correctness with non-power-of-two vectors. - if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) + if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) return CurrentOrder; } @@ -6393,13 +6385,15 @@ void BoUpSLP::buildExternalUses( } } -DenseMap> +SmallVector> BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { - DenseMap> PtrToStoresMap; + SmallDenseMap, + SmallVector, 8> + PtrToStoresMap; for (unsigned Lane : seq(0, TE->Scalars.size())) { Value *V = TE->Scalars[Lane]; // Don't iterate over the users of constant data. - if (isa(V)) + if (!isa(V)) continue; // To save compilation time we don't visit if we have too many users. if (V->hasNUsesOrMore(UsesLimit)) @@ -6417,25 +6411,34 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { if (getTreeEntry(U)) continue; - Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); - auto &StoresVec = PtrToStoresMap[Ptr]; + Value *Ptr = + getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth); + auto &StoresVec = PtrToStoresMap[{SI->getParent(), + SI->getValueOperand()->getType(), Ptr}]; // For now just keep one store per pointer object per lane. // TODO: Extend this to support multiple stores per pointer per lane if (StoresVec.size() > Lane) continue; - // Skip if in different BBs. - if (!StoresVec.empty() && - SI->getParent() != StoresVec.back()->getParent()) - continue; - // Make sure that the stores are of the same type. - if (!StoresVec.empty() && - SI->getValueOperand()->getType() != - StoresVec.back()->getValueOperand()->getType()) - continue; + if (!StoresVec.empty()) { + std::optional Diff = getPointersDiff( + SI->getValueOperand()->getType(), SI->getPointerOperand(), + SI->getValueOperand()->getType(), + StoresVec.front()->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + // We failed to compare the pointers so just abandon this store. + if (!Diff) + continue; + } StoresVec.push_back(SI); } } - return PtrToStoresMap; + SmallVector> Res(PtrToStoresMap.size()); + unsigned I = 0; + for (auto &P : PtrToStoresMap) { + Res[I].swap(P.second); + ++I; + } + return Res; } bool BoUpSLP::canFormVector(ArrayRef StoresVec, @@ -6445,9 +6448,9 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, // To avoid calling getPointersDiff() while sorting we create a vector of // pairs {store, offset from first} and sort this instead. - SmallVector> StoreOffsetVec(StoresVec.size()); + SmallVector> StoreOffsetVec; StoreInst *S0 = StoresVec[0]; - StoreOffsetVec[0] = {S0, 0}; + StoreOffsetVec.emplace_back(0, 0); Type *S0Ty = S0->getValueOperand()->getType(); Value *S0Ptr = S0->getPointerOperand(); for (unsigned Idx : seq(1, StoresVec.size())) { @@ -6456,41 +6459,36 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); - // We failed to compare the pointers so just abandon this StoresVec. - if (!Diff) - return false; - StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff}; + StoreOffsetVec.emplace_back(*Diff, Idx); } - // Sort the vector based on the pointers. We create a copy because we may - // need the original later for calculating the reorder (shuffle) indices. - stable_sort(StoreOffsetVec, [](const std::pair &Pair1, - const std::pair &Pair2) { - int Offset1 = Pair1.second; - int Offset2 = Pair2.second; - return Offset1 < Offset2; - }); - // Check if the stores are consecutive by checking if their difference is 1. 
- for (unsigned Idx : seq(1, StoreOffsetVec.size())) - if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1) + if (StoreOffsetVec.size() != StoresVec.size()) + return false; + sort(StoreOffsetVec, + [](const std::pair &L, + const std::pair &R) { return L.first < R.first; }); + unsigned Idx = 0; + int PrevDist = 0; + for (const auto &P : StoreOffsetVec) { + if (Idx > 0 && P.first != PrevDist + 1) return false; + PrevDist = P.first; + ++Idx; + } // Calculate the shuffle indices according to their offset against the sorted // StoreOffsetVec. - ReorderIndices.reserve(StoresVec.size()); - for (StoreInst *SI : StoresVec) { - unsigned Idx = find_if(StoreOffsetVec, - [SI](const std::pair &Pair) { - return Pair.first == SI; - }) - - StoreOffsetVec.begin(); - ReorderIndices.push_back(Idx); + ReorderIndices.assign(StoresVec.size(), 0); + bool IsIdentity = true; + for (auto [I, P] : enumerate(StoreOffsetVec)) { + ReorderIndices[P.second] = I; + IsIdentity &= P.second == I; } // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in // reorderTopToBottom() and reorderBottomToTop(), so we are following the // same convention here. - if (isIdentityOrder(ReorderIndices)) + if (IsIdentity) ReorderIndices.clear(); return true; @@ -6508,8 +6506,7 @@ SmallVector BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { unsigned NumLanes = TE->Scalars.size(); - DenseMap> PtrToStoresMap = - collectUserStores(TE); + SmallVector> Stores = collectUserStores(TE); // Holds the reorder indices for each candidate store vector that is a user of // the current TreeEntry. @@ -6518,8 +6515,7 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { // Now inspect the stores collected per pointer and look for vectorization // candidates. For each candidate calculate the reorder index vector and push // it into `ExternalReorderIndices` - for (const auto &Pair : PtrToStoresMap) { - auto &StoresVec = Pair.second; + for (ArrayRef StoresVec : Stores) { // If we have fewer than NumLanes stores, then we can't form a vector. 
if (StoresVec.size() != NumLanes) continue; @@ -6574,9 +6570,13 @@ static void gatherPossiblyVectorizableLoads( continue; bool IsFound = false; for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) { - if (LI->getParent() != Data.front().first->getParent() || - LI->getType() != Data.front().first->getType()) - continue; + assert(LI->getParent() == Data.front().first->getParent() && + LI->getType() == Data.front().first->getType() && + getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) == + getUnderlyingObject(Data.front().first->getPointerOperand(), + RecursionMaxDepth) && + "Expected loads with the same type, same parent and same " + "underlying pointer."); std::optional Dist = getPointersDiff( LI->getType(), LI->getPointerOperand(), Data.front().first->getType(), Data.front().first->getPointerOperand(), DL, SE, @@ -6704,7 +6704,9 @@ static void gatherPossiblyVectorizableLoads( } void BoUpSLP::tryToVectorizeGatheredLoads( - ArrayRef>> GatheredLoads) { + const SmallMapVector, + SmallVector>>, + 8> &GatheredLoads) { GatheredLoadsEntriesFirst = VectorizableTree.size(); SmallVector> LoadSetsToVectorize( @@ -6737,7 +6739,10 @@ void BoUpSLP::tryToVectorizeGatheredLoads( SmallVector CandidateVFs; if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1)) CandidateVFs.push_back(MaxVF); - for (int NumElts = bit_floor(MaxVF); NumElts > 1; NumElts /= 2) { + for (int NumElts = getFloorFullVectorNumberOfElements( + *TTI, Loads.front()->getType(), MaxVF); + NumElts > 1; NumElts = getFloorFullVectorNumberOfElements( + *TTI, Loads.front()->getType(), NumElts - 1)) { CandidateVFs.push_back(NumElts); if (VectorizeNonPowerOf2 && NumElts > 2) CandidateVFs.push_back(NumElts - 1); @@ -6751,9 +6756,10 @@ void BoUpSLP::tryToVectorizeGatheredLoads( if (Final && NumElts > BestVF) continue; SmallVector MaskedGatherVectorized; - for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E; + for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) { - ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); + ArrayRef Slice = + ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt)); if (VectorizedLoads.count(Slice.front()) || VectorizedLoads.count(Slice.back()) || areKnownNonVectorizableLoads(Slice)) @@ -7099,24 +7105,27 @@ void BoUpSLP::tryToVectorizeGatheredLoads( } return NonVectorized; }; - SmallVector NonVectorized = ProcessGatheredLoads(GatheredLoads); - if (!GatheredLoads.empty() && !NonVectorized.empty() && - std::accumulate( - GatheredLoads.begin(), GatheredLoads.end(), 0u, - [](unsigned S, ArrayRef> LoadsDists) { - return S + LoadsDists.size(); - }) != NonVectorized.size() && - IsMaskedGatherSupported(NonVectorized)) { - SmallVector>> FinalGatheredLoads; - for (LoadInst *LI : NonVectorized) { - // Reinsert non-vectorized loads to other list of loads with the same - // base pointers. - gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, - FinalGatheredLoads, - /*AddNew=*/false); - } - // Final attempt to vectorize non-vectorized loads. 
- (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true); + for (const auto &GLs : GatheredLoads) { + const auto &Ref = GLs.second; + SmallVector NonVectorized = ProcessGatheredLoads(Ref); + if (!Ref.empty() && !NonVectorized.empty() && + std::accumulate( + Ref.begin(), Ref.end(), 0u, + [](unsigned S, ArrayRef> LoadsDists) { + return S + LoadsDists.size(); + }) != NonVectorized.size() && + IsMaskedGatherSupported(NonVectorized)) { + SmallVector>> FinalGatheredLoads; + for (LoadInst *LI : NonVectorized) { + // Reinsert non-vectorized loads to other list of loads with the same + // base pointers. + gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, + FinalGatheredLoads, + /*AddNew=*/false); + } + // Final attempt to vectorize non-vectorized loads. + (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true); + } } // Try to vectorize postponed load entries, previously marked as gathered. for (unsigned Idx : LoadEntriesToVectorize) { @@ -7363,13 +7372,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) { assert(S.MainOp && "Expected instructions with same/alternate opcodes only."); - if (S.MainOp->getType()->isFloatingPointTy() && - TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { - auto *I = dyn_cast(V); - return I && (I->isBinaryOp() || isa(I)) && !I->isFast(); - })) - return TreeEntry::NeedToGather; - unsigned ShuffleOrOp = S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); auto *VL0 = cast(S.OpValue); @@ -7534,6 +7536,12 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case Instruction::Or: case Instruction::Xor: case Instruction::Freeze: + if (S.MainOp->getType()->isFloatingPointTy() && + TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { + auto *I = dyn_cast(V); + return I && I->isBinaryOp() && !I->isFast(); + })) + return TreeEntry::NeedToGather; return TreeEntry::Vectorize; case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. @@ -7625,6 +7633,12 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::NeedToGather; } case Instruction::Call: { + if (S.MainOp->getType()->isFloatingPointTy() && + TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { + auto *I = dyn_cast(V); + return I && !I->isFast(); + })) + return TreeEntry::NeedToGather; // Check if the calls are all to the same vectorizable intrinsic or // library function. CallInst *CI = cast(VL0); @@ -9344,8 +9358,13 @@ void BoUpSLP::transformNodes() { // insertvector instructions. unsigned StartIdx = 0; unsigned End = VL.size(); - for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) { - SmallVector Slices; + for (unsigned VF = getFloorFullVectorNumberOfElements( + *TTI, VL.front()->getType(), VL.size() - 1); + VF >= MinVF; VF = getFloorFullVectorNumberOfElements( + *TTI, VL.front()->getType(), VF - 1)) { + if (StartIdx + VF > End) + continue; + SmallVector> Slices; for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { ArrayRef Slice = VL.slice(Cnt, VF); // If any instruction is vectorized already - do not try again. 
@@ -9375,7 +9394,10 @@ void BoUpSLP::transformNodes() { if (IsSplat) continue; InstructionsState S = getSameOpcode(Slice, *TLI); - if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice)) + if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) || + (S.getOpcode() == Instruction::Load && + areKnownNonVectorizableLoads(Slice)) || + (S.getOpcode() != Instruction::Load && !has_single_bit(VF))) continue; if (VF == 2) { // Try to vectorize reduced values or if all users are vectorized. @@ -9395,8 +9417,16 @@ void BoUpSLP::transformNodes() { canVectorizeLoads(Slice, Slice.front(), Order, PointerOps); // Do not vectorize gathers. if (Res == LoadsState::ScatterVectorize || - Res == LoadsState::Gather) + Res == LoadsState::Gather) { + if (Res == LoadsState::Gather) { + registerNonVectorizableLoads(Slice); + // If reductions and the scalars from the root node are + // analyzed - mark as non-vectorizable reduction. + if (UserIgnoreList && E.Idx == 0) + analyzedReductionVals(Slice); + } continue; + } } else if (S.getOpcode() == Instruction::ExtractElement || (TTI->getInstructionCost( cast(Slice.front()), CostKind) < @@ -9411,17 +9441,17 @@ void BoUpSLP::transformNodes() { } } } - Slices.emplace_back(Cnt); + Slices.emplace_back(Cnt, Slice.size()); } - auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) { + auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) { E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt); if (StartIdx == Cnt) - StartIdx = Cnt + VF; - if (End == Cnt + VF) + StartIdx = Cnt + Sz; + if (End == Cnt + Sz) End = Cnt; }; - for (unsigned Cnt : Slices) { - ArrayRef Slice = VL.slice(Cnt, VF); + for (auto [Cnt, Sz] : Slices) { + ArrayRef Slice = VL.slice(Cnt, Sz); // If any instruction is vectorized already - do not try again. if (TreeEntry *SE = getTreeEntry(Slice.front()); SE || getTreeEntry(Slice.back())) { @@ -9430,7 +9460,7 @@ void BoUpSLP::transformNodes() { if (VF != SE->getVectorFactor() || !SE->isSame(Slice)) continue; SE->UserTreeIndices.emplace_back(&E, UINT_MAX); - AddCombinedNode(SE->Idx, Cnt); + AddCombinedNode(SE->Idx, Cnt, Sz); continue; } unsigned PrevSize = VectorizableTree.size(); @@ -9442,12 +9472,14 @@ void BoUpSLP::transformNodes() { VectorizableTree[PrevSize]->getOpcode() != Instruction::ExtractElement && !isSplat(Slice)) { + if (UserIgnoreList && E.Idx == 0 && VF == 2) + analyzedReductionVals(Slice); VectorizableTree.pop_back(); assert(PrevEntriesSize == LoadEntriesToVectorize.size() && "LoadEntriesToVectorize expected to remain the same"); continue; } - AddCombinedNode(PrevSize, Cnt); + AddCombinedNode(PrevSize, Cnt, Sz); } } } @@ -9542,11 +9574,24 @@ void BoUpSLP::transformNodes() { VectorizableTree.front()->Scalars.size() == SmallVF) || (VectorizableTree.size() <= 2 && UserIgnoreList)) return; + + if (VectorizableTree.front()->isNonPowOf2Vec() && + getCanonicalGraphSize() != getTreeSize() && UserIgnoreList && + getCanonicalGraphSize() <= SmallTree && + count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), + [](const std::unique_ptr &TE) { + return TE->isGather() && + TE->getOpcode() == Instruction::Load && + !allSameBlock(TE->Scalars); + }) == 1) + return; } // A list of loads to be gathered during the vectorization process. We can // try to vectorize them at the end, if profitable. 
- SmallVector>> GatheredLoads; + SmallMapVector, + SmallVector>>, 8> + GatheredLoads; for (std::unique_ptr &TE : VectorizableTree) { TreeEntry &E = *TE; @@ -9558,9 +9603,21 @@ void BoUpSLP::transformNodes() { !isVectorized(V) && !isDeleted(cast(V)); }))) && - !isSplat(E.Scalars)) - gatherPossiblyVectorizableLoads(*this, E.Scalars, *DL, *SE, *TTI, - GatheredLoads); + !isSplat(E.Scalars)) { + for (Value *V : E.Scalars) { + auto *LI = dyn_cast(V); + if (!LI) + continue; + if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple()) + continue; + gatherPossiblyVectorizableLoads( + *this, V, *DL, *SE, *TTI, + GatheredLoads[std::make_tuple( + LI->getParent(), + getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth), + LI->getType())]); + } + } } // Try to vectorize gathered loads if this is not just a gather of loads. if (!GatheredLoads.empty()) @@ -11515,6 +11572,34 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return true; } +bool BoUpSLP::isTreeNotExtendable() const { + if (getCanonicalGraphSize() != getTreeSize()) { + constexpr unsigned SmallTree = 3; + if (VectorizableTree.front()->isNonPowOf2Vec() && + getCanonicalGraphSize() <= SmallTree && + count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), + [](const std::unique_ptr &TE) { + return TE->isGather() && + TE->getOpcode() == Instruction::Load && + !allSameBlock(TE->Scalars); + }) == 1) + return true; + return false; + } + bool Res = false; + for (unsigned Idx : seq(getTreeSize())) { + TreeEntry &E = *VectorizableTree[Idx]; + if (!E.isGather()) + continue; + if (E.getOpcode() && E.getOpcode() != Instruction::Load) + return false; + if (isSplat(E.Scalars) || allConstant(E.Scalars)) + continue; + Res = true; + } + return Res; +} + InstructionCost BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, @@ -18747,7 +18832,8 @@ public: auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) { Key = hash_combine(hash_value(LI->getParent()), Key); - Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); + Value *Ptr = + getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth); if (!LoadKeyUsed.insert(Key).second) { auto LIt = LoadsMap.find(std::make_pair(Key, Ptr)); if (LIt != LoadsMap.end()) { @@ -19070,8 +19156,28 @@ public: RegMaxNumber * RedValsMaxNumber); unsigned ReduxWidth = NumReducedVals; + auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) { + unsigned NumParts, NumRegs; + Type *ScalarTy = Candidates.front()->getType(); + ReduxWidth = + getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth); + VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); + NumParts = TTI.getNumberOfParts(Tp); + NumRegs = + TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); + while (NumParts > NumRegs) { + ReduxWidth = bit_floor(ReduxWidth - 1); + VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); + NumParts = TTI.getNumberOfParts(Tp); + NumRegs = + TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); + } + if (NumParts > NumRegs / 2) + ReduxWidth = bit_floor(ReduxWidth); + return ReduxWidth; + }; if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1)) - ReduxWidth = bit_floor(ReduxWidth); + ReduxWidth = GetVectorFactor(ReduxWidth); ReduxWidth = std::min(ReduxWidth, MaxElts); unsigned Start = 0; @@ -19079,10 +19185,7 @@ public: // Restarts vectorization attempt with lower vector factor. 
unsigned PrevReduxWidth = ReduxWidth; bool CheckForReusedReductionOpsLocal = false; - auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals, - &CheckForReusedReductionOpsLocal, - &PrevReduxWidth, &V, - &IgnoreList](bool IgnoreVL = false) { + auto AdjustReducedVals = [&](bool IgnoreVL = false) { bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList); if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { // Check if any of the reduction ops are gathered. If so, worth @@ -19093,10 +19196,13 @@ public: if (Pos < NumReducedVals - ReduxWidth + 1) return IsAnyRedOpGathered; Pos = Start; - ReduxWidth = bit_ceil(ReduxWidth) / 2; + --ReduxWidth; + if (ReduxWidth > 1) + ReduxWidth = GetVectorFactor(ReduxWidth); return IsAnyRedOpGathered; }; bool AnyVectorized = false; + SmallDenseSet, 8> IgnoredCandidates; while (Pos < NumReducedVals - ReduxWidth + 1 && ReduxWidth >= ReductionLimit) { // Dependency in tree of the reduction ops - drop this attempt, try @@ -19108,8 +19214,15 @@ public: } PrevReduxWidth = ReduxWidth; ArrayRef VL(std::next(Candidates.begin(), Pos), ReduxWidth); - // Beeing analyzed already - skip. - if (V.areAnalyzedReductionVals(VL)) { + // Been analyzed already - skip. + if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) || + (!has_single_bit(ReduxWidth) && + (IgnoredCandidates.contains( + std::make_pair(Pos, bit_floor(ReduxWidth))) || + IgnoredCandidates.contains( + std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)), + bit_floor(ReduxWidth))))) || + V.areAnalyzedReductionVals(VL)) { (void)AdjustReducedVals(/*IgnoreVL=*/true); continue; } @@ -19215,8 +19328,24 @@ public: << " and threshold " << ore::NV("Threshold", -SLPCostThreshold); }); - if (!AdjustReducedVals()) + if (!AdjustReducedVals()) { V.analyzedReductionVals(VL); + unsigned Offset = Pos == Start ? Pos : Pos - 1; + if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) { + // Add subvectors of VL to the list of the analyzed values. 
+ for (unsigned VF = getFloorFullVectorNumberOfElements( + *TTI, VL.front()->getType(), ReduxWidth - 1); + VF >= ReductionLimit; + VF = getFloorFullVectorNumberOfElements( + *TTI, VL.front()->getType(), VF - 1)) { + if (has_single_bit(VF) && + V.getCanonicalGraphSize() != V.getTreeSize()) + continue; + for (unsigned Idx : seq(ReduxWidth - VF)) + IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF)); + } + } + } continue; } @@ -19325,7 +19454,9 @@ public: } Pos += ReduxWidth; Start = Pos; - ReduxWidth = llvm::bit_floor(NumReducedVals - Pos); + ReduxWidth = NumReducedVals - Pos; + if (ReduxWidth > 1) + ReduxWidth = GetVectorFactor(NumReducedVals - Pos); AnyVectorized = true; } if (OptReusedScalars && !AnyVectorized) { diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 212177522409..354791ddd6de 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -43,32 +43,32 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x 
i8> [[TMP27]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP34]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] @@ -86,19 +86,19 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> ; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] ; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> ; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]] ; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]] -; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: 
[[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> ; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] ; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> ; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], ; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], ; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index 5f0b16048d40..b16164c4e5ff 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1215,26 +1215,26 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = 
shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> @@ -1242,14 +1242,14 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> @@ -1262,7 +1262,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> ; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index fffa626cae0d..c431b058f0d2 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -17,18 +17,17 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3 +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] -; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll index 833bc56c4ec6..2191d04cd797 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll @@ -7,8 +7,7 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POWER-OF-2-NEXT: entry: ; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 ; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> -; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> +; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0) ; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POWER-OF-2-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index 757d0b1708b6..234b65803238 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -11,19 +11,21 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 -; CHECK-NEXT: 
[[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0) ; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] -; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4 ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: entry.if.end72_crit_edge: ; CHECK-NEXT: br label [[IF_END72:%.*]] @@ -46,8 +48,7 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP26]], ptr [[RC21]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 72e29839230e..c9ff2d6426d2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -318,22 +318,14 @@ entry: define float @f(ptr nocapture readonly %x) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) ; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) -; THRESHOLD-NEXT: [[TMP3:%.*]] = call 
fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: @@ -606,18 +598,14 @@ define float @loadadd31(ptr nocapture readonly %x) { ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] @@ -627,18 +615,14 @@ define float @loadadd31(ptr nocapture readonly %x) { ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 -; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) -; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x 
float> [[TMP0]]) ; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index a7201e776fb4..0bc91d42b0f1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1013,11 +1013,11 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: -; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 -; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4) -; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0) -; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2) +; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0) +; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP4]], i64 4) +; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP2]], i64 6) ; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) ; THRESH-NEXT: ret i32 [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll index 47dd84c7f6e9..4898111960c0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll @@ -7,10 +7,11 @@ define void @e(ptr %c, i64 %0) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C]], align 8 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 112 -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 104 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX5]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP18]], <2 x ptr> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2 @@ -18,7 +19,7 @@ 
define void @e(ptr %c, i64 %0) { ; CHECK-NEXT: [[TMP9:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP8]], <2 x ptr> [[TMP4]], i64 0) ; CHECK-NEXT: [[TMP10:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP9]], <2 x ptr> [[TMP6]], i64 4) ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i64> [[TMP13]], <32 x i64> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i64> [[TMP14]], [[TMP12]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll index 93258f2975f3..2623b7689f4f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll @@ -8,197 +8,6 @@ ; into bb1, vectorizing all the way to the broadcast load at the top. ; The stores in bb1 are external to this tree, but they are vectorizable and are ; in reverse order. -define void @rotate_with_external_users(ptr %A, ptr %ptr) { -; CHECK-LABEL: @rotate_with_external_users( -; CHECK-NEXT: bb1: -; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[A:%.*]], align 8 -; CHECK-NEXT: br label [[BB2:%.*]] -; CHECK: bb2: -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 -; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]] -; CHECK-NEXT: ret void -; -bb1: - %ld = load double, ptr undef - - %add1 = fadd double %ld, 1.1 - %add2 = fadd double %ld, 2.2 - - %mul1 = fmul double %add1, 1.1 - %mul2 = fmul double %add2, 2.2 - - ; Thes are external vectorizable stores with operands in reverse order. - %ptrA2 = getelementptr inbounds double, ptr %A, i64 1 - store double %mul2, ptr %A - store double %mul1, ptr %ptrA2 - br label %bb2 - -bb2: - %add3 = fadd double %mul1, 3.3 - %add4 = fadd double %mul2, 4.4 - %seed = fcmp ogt double %add3, %add4 - ret void -} - -; This checks that non-consecutive external users are skipped. 
-define void @non_consecutive_external_users(ptr %A, ptr %ptr) { -; CHECK-LABEL: @non_consecutive_external_users( -; CHECK-NEXT: bb1: -; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[LD]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], -; CHECK-NEXT: [[PTRA4:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 3 -; CHECK-NEXT: store double [[TMP4]], ptr [[A]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 2 -; CHECK-NEXT: store double [[TMP5]], ptr [[A]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 1 -; CHECK-NEXT: store double [[TMP6]], ptr [[PTRA4]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 0 -; CHECK-NEXT: store double [[TMP7]], ptr [[PTRA4]], align 8 -; CHECK-NEXT: br label [[SEED_LOOP:%.*]] -; CHECK: seed_loop: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x double> [ [[TMP3]], [[BB1:%.*]] ], [ zeroinitializer, [[SEED_LOOP]] ] -; CHECK-NEXT: br label [[SEED_LOOP]] -; -bb1: - %ld = load double, ptr undef - - %add5 = fadd double %ld, 1.1 - %add6 = fadd double %ld, 2.2 - %add7 = fadd double %ld, 3.3 - %add8 = fadd double %ld, 4.4 - - %add1 = fadd double %add5, 1.1 - %add2 = fadd double %add6, 2.2 - %add3 = fadd double %add7, 3.3 - %add4 = fadd double %add8, 4.4 - - %mul1 = fmul double %add1, 1.1 - %mul2 = fmul double %add2, 2.2 - %mul3 = fmul double %add3, 3.3 - %mul4 = fmul double %add4, 4.4 - - ; External non-consecutive stores. - %ptrA4 = getelementptr inbounds double, ptr %A, i64 3 - store double %mul4, ptr %A - store double %mul3, ptr %A - store double %mul2, ptr %ptrA4 - store double %mul1, ptr %ptrA4 - br label %seed_loop - -seed_loop: - %phi1 = phi double [ %mul1, %bb1 ], [ 0.0, %seed_loop ] - %phi2 = phi double [ %mul2, %bb1 ], [ 0.0, %seed_loop ] - %phi3 = phi double [ %mul3, %bb1 ], [ 0.0, %seed_loop ] - %phi4 = phi double [ %mul4, %bb1 ], [ 0.0, %seed_loop ] - br label %seed_loop -} - -; We have to be careful when the tree contains add/sub patterns that could be -; combined into a single addsub instruction. Reordering can block the pattern. 
-define void @addsub_and_external_users(ptr %A, ptr %ptr) { -; CHECK-LABEL: @addsub_and_external_users( -; CHECK-NEXT: bb1: -; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: store <2 x double> [[SHUFFLE1]], ptr [[A:%.*]], align 8 -; CHECK-NEXT: br label [[BB2:%.*]] -; CHECK: bb2: -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]] -; CHECK-NEXT: ret void -; -bb1: - %ld = load double, ptr undef - - %sub1 = fsub double %ld, 1.1 - %add2 = fadd double %ld, 1.2 - - %div1 = fdiv double %sub1, 2.1 - %div2 = fdiv double %add2, 2.2 - - %mul1 = fmul double %div1, 3.1 - %mul2 = fmul double %div2, 3.2 - - ; These are external vectorizable stores with operands in reverse order. - %ptrA1 = getelementptr inbounds double, ptr %A, i64 1 - store double %mul2, ptr %A - store double %mul1, ptr %ptrA1 - br label %bb2 - -bb2: - %addS1 = fadd double %mul1, 4.1 - %addS2 = fadd double %mul2, 4.2 - %seed = fcmp ogt double %addS1, %addS2 - ret void -} - -; This contains a sub/add bundle, reordering it will make it better. -define void @subadd_and_external_users(ptr %A, ptr %ptr) { -; CHECK-LABEL: @subadd_and_external_users( -; CHECK-NEXT: bb1: -; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[A:%.*]], align 8 -; CHECK-NEXT: br label [[BB2:%.*]] -; CHECK: bb2: -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP9]], [[TMP8]] -; CHECK-NEXT: ret void -; -bb1: - %ld = load double, ptr undef - - %add1 = fadd double %ld, 1.1 - %sub2 = fsub double %ld, 1.2 - - %div1 = fdiv double %add1, 2.1 - %div2 = fdiv double %sub2, 2.2 - - %mul1 = fmul double %div1, 3.1 - %mul2 = fmul double %div2, 3.2 - - ; These are external vectorizable stores with operands in reverse order. 
- %ptrA1 = getelementptr inbounds double, ptr %A, i64 1 - store double %mul2, ptr %A - store double %mul1, ptr %ptrA1 - br label %bb2 - -bb2: - %addS1 = fadd double %mul1, 4.1 - %addS2 = fadd double %mul2, 4.2 - %seed = fcmp ogt double %addS1, %addS2 - ret void -} - define void @alt_but_not_addsub_and_external_users(ptr %A, ptr %ptr) { ; CHECK-LABEL: @alt_but_not_addsub_and_external_users( ; CHECK-NEXT: bb1: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll index fd3c1a57aff3..a821362a883a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll @@ -7,8 +7,7 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 ; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> +; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0) ; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POW2-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll index e1b091cc6fcd..9719e60a6a69 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -8,18 +8,18 @@ define void @test(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 2 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> 
[[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[RESULT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] -- GitLab From f148d5791bae39fdbe6c97559c82b6c6ab64a100 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Oct 2024 15:51:49 -0400 Subject: [PATCH 103/511] [LV] Initial support for safe distance in predicated DataWithEVL vectorization mode. Enabled initial support for max safe distance in DataWithEVL mode. If a max safe distance is required, we need to emit special code: CMP = icmp ult AVL, MAX_SAFE_DISTANCE SAFE_AVL = select CMP, AVL, MAX_SAFE_DISTANCE EVL = call i32 @llvm.experimental.get.vector.length(i64 SAFE_AVL) while vectorizing the loop in DataWithEVL tail folding mode. Reviewers: fhahn Reviewed By: fhahn Pull Request: https://github.com/llvm/llvm-project/pull/102897 --- .../Transforms/Vectorize/LoopVectorize.cpp | 30 +++++++++++---- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++-- .../Transforms/Vectorize/VPlanTransforms.cpp | 38 +++++++++++++++---- .../Transforms/Vectorize/VPlanTransforms.h | 4 +- ...e-force-tail-with-evl-safe-dep-distance.ll | 31 +++++++++------ 5 files changed, 81 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3e8bc1451f62..e377e1d82037 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1427,12 +1427,9 @@ public: // Override forced styles if needed. // FIXME: use actual opcode/data type for analysis here. // FIXME: Investigate opportunity for fixed vector factor. - bool EVLIsLegal = - IsScalableVF && UserIC <= 1 && - TTI.hasActiveVectorLength(0, nullptr, Align()) && - !EnableVPlanNativePath && - // FIXME: implement support for max safe dependency distance. - Legal->isSafeForAnyVectorWidth(); + bool EVLIsLegal = UserIC <= 1 && + TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath; if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail @@ -1457,6 +1454,15 @@ public: return getTailFoldingStyle() != TailFoldingStyle::None; } + /// Return maximum safe number of elements to be processed per vector + /// iteration, which do not prevent store-load forwarding and are safe with + /// regard to the memory dependencies. Required for EVL-based VPlans to + /// correctly calculate AVL (application vector length) as min(remaining AVL, + /// MaxSafeElements). + /// TODO: need to consider adjusting cost model to use this value as a + /// vectorization factor for EVL-based vectorization. + std::optional getMaxSafeElements() const { return MaxSafeElements; } + /// Returns true if the instructions in this block requires predication /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. @@ -1608,6 +1614,12 @@ private: /// true if scalable vectorization is supported and enabled. std::optional IsScalableVectorizationAllowed; + /// Maximum safe number of elements to be processed per vector iteration, + /// which do not prevent store-load forwarding and are safe with regard to the + /// memory dependencies. Required for EVL-based vectorization, where this + /// value is used as the upper bound of the safe AVL.
+  std::optional<unsigned> MaxSafeElements;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -3858,6 +3870,8 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
 
   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+  if (!Legal->isSafeForAnyVectorWidth())
+    this->MaxSafeElements = MaxSafeElements;
 
   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                     << ".\n");
@@ -8686,8 +8700,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
       VPlanTransforms::optimize(*Plan);
       // TODO: try to put it close to addActiveLaneMask().
       // Discard the plan if it is not EVL-compatible
-      if (CM.foldTailWithEVL() &&
-          !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+      if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
+                                      *Plan, CM.getMaxSafeElements()))
         break;
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a38cdfc542cb..f4a1f58debba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -392,6 +392,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
     return true;
   switch (Opcode) {
   case Instruction::ICmp:
+  case Instruction::Select:
   case VPInstruction::BranchOnCond:
   case VPInstruction::BranchOnCount:
   case VPInstruction::CalculateTripCountMinusVF:
@@ -440,9 +441,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreateCmp(getPredicate(), A, B, Name);
   }
   case Instruction::Select: {
-    Value *Cond = State.get(getOperand(0));
-    Value *Op1 = State.get(getOperand(1));
-    Value *Op2 = State.get(getOperand(2));
+    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+    Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
+    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
+    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
     return Builder.CreateSelect(Cond, Op1, Op2, Name);
   }
   case VPInstruction::ActiveLaneMask: {
@@ -742,6 +744,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   default:
     return false;
   case Instruction::ICmp:
+  case Instruction::Select:
   case VPInstruction::PtrAdd:
     // TODO: Cover additional opcodes.
     return vputils::onlyFirstLaneUsed(this);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index faec08cac187..d50f3c0c3f3e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1439,7 +1439,24 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 /// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
 /// ...
 ///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+/// If MaxSafeElements is provided, the function adds the following recipes:
+/// vector.ph:
+/// ...
+///
+/// vector.body:
+/// ...
+/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
+///                                               [ %NextEVLIV, %vector.body ]
+/// %AVL = sub original TC, %EVLPhi
+/// %cmp = cmp ult %AVL, MaxSafeElements
+/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
+/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
+/// ...
+/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
+/// ...
+///
+bool VPlanTransforms::tryAddExplicitVectorLength(
+    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   // The transform updates all users of inductions to work based on EVL, instead
   // of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1464,14 +1481,19 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
   // Create the ExplicitVectorLengthPhi recipe in the main loop.
   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
   EVLPhi->insertAfter(CanonicalIVPHI);
-  // TODO: Add support for MaxSafeDist for correct loop emission.
+  VPBuilder Builder(Header, Header->getFirstNonPhi());
   // Compute original TC - IV as the AVL (application vector length).
-  auto *AVL = new VPInstruction(Instruction::Sub, {Plan.getTripCount(), EVLPhi},
-                                DebugLoc(), "avl");
-  AVL->insertBefore(*Header, Header->getFirstNonPhi());
-  auto *VPEVL =
-      new VPInstruction(VPInstruction::ExplicitVectorLength, AVL, DebugLoc());
-  VPEVL->insertAfter(AVL);
+  VPValue *AVL = Builder.createNaryOp(
+      Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
+  if (MaxSafeElements) {
+    // Support for MaxSafeDist for correct loop emission.
+    VPValue *AVLSafe = Plan.getOrAddLiveIn(
+        ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements));
+    VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
+    AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
+  }
+  auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
+                                     DebugLoc());
 
   auto *CanonicalIVIncrement =
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3b792ee32dce..60a44bfb0dca 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -108,7 +108,9 @@ struct VPlanTransforms {
   /// VPCanonicalIVPHIRecipe is only used to control the loop after
   /// this transformation.
   /// \returns true if the transformation succeeds, or false if it doesn't.
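// Call-site sketch (illustrative, not part of the patch): the planner passes
// the cost model's bound through the new parameter, as in the
// LoopVectorize.cpp hunk above:
//   VPlanTransforms::tryAddExplicitVectorLength(*Plan, CM.getMaxSafeElements());
// A std::nullopt bound means any vector width is safe and no clamp is emitted.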
-  static bool tryAddExplicitVectorLength(VPlan &Plan);
+  static bool
+  tryAddExplicitVectorLength(VPlan &Plan,
+                             const std::optional<unsigned> &MaxEVLSafeElements);
 
   // For each Interleave Group in \p InterleaveGroups replace the Recipes
   // widening its memory instructions with a single VPInterleaveRecipe at its
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
index 2dd47d5c1ea8..322a6c16871a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -422,28 +422,37 @@ define void @no_high_lmul_or_interleave(ptr %p) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 3002, [[TMP1]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]],
-; IF-EVL-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]],
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 3002, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[AVL]], 1024
+; IF-EVL-NEXT:    [[SAFE_AVL:%.*]] = select i1 [[TMP9]], i64 [[AVL]], i64 1024
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 1, i1 true)
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[EVL_BASED_IV]], 0
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP3]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP10]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = add i64 [[TMP0]], 1024
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; IF-EVL-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; IF-EVL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
-; IF-EVL-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP6]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32>
zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
-; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT:    br label [[LOOP:%.*]]
 ; IF-EVL:       loop:
 ; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-- 
GitLab


From 98e838a890191b9250ad33741a1c121a9591caa3 Mon Sep 17 00:00:00 2001
From: Max191 <44243577+Max191@users.noreply.github.com>
Date: Fri, 18 Oct 2024 13:02:03 -0700
Subject: [PATCH 104/511] [mlir] Do not bufferize parallel_insert_slice dest to
 read for full slices (#112761)

In the insert_slice bufferization interface implementation, the
destination tensor is not considered read if the full tensor is
overwritten by the slice. This PR adds the same check for
tensor.parallel_insert_slice.

Adds two new StaticValueUtils:
- `areAllConstantIntValue` checks if all values in an array of
  `OpFoldResult` are equal to a passed `int64_t` value.
- `areConstantIntValues` checks if the values in an array of
  `OpFoldResult` are equal to the corresponding values in a passed
  array of `int64_t`.

Fixes https://github.com/llvm/llvm-project/issues/112435

---------

Signed-off-by: Max Dawkins
---
 .../mlir/Dialect/Utils/StaticValueUtils.h     |  6 +++
 .../BufferizableOpInterfaceImpl.cpp           | 54 +++++++++----------
 .../Transforms/PackAndUnpackPatterns.cpp      |  5 --
 mlir/lib/Dialect/Utils/StaticValueUtils.cpp   | 15 +++++-
 .../Dialect/Tensor/one-shot-bufferize.mlir    | 15 ++++++
 5 files changed, 62 insertions(+), 33 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
index ba4f084d3efd..4d7aa1ae17fd 100644
--- a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
@@ -92,6 +92,12 @@ getConstantIntValues(ArrayRef<OpFoldResult> ofrs);
 
 /// Return true if `ofr` is constant integer equal to `value`.
 bool isConstantIntValue(OpFoldResult ofr, int64_t value);
+/// Return true if all of `ofrs` are constant integers equal to `value`.
+bool areAllConstantIntValue(ArrayRef<OpFoldResult> ofrs, int64_t value);
+/// Return true if all of `ofrs` are constant integers equal to the
+/// corresponding value in `values`.
+bool areConstantIntValues(ArrayRef<OpFoldResult> ofrs,
+                          ArrayRef<int64_t> values);
 
 /// Return true if ofr1 and ofr2 are the same integer constant attribute
 /// values or the same SSA value.
Ignore integer bitwitdh and type mismatch
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index 87464ccb7172..c2b8614148bf 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Operation.h"
 
@@ -636,6 +637,28 @@ struct InsertOpInterface
   }
 };
 
+template <typename InsertOpTy>
+static bool insertSliceOpRequiresRead(InsertOpTy insertSliceOp,
+                                      OpOperand &opOperand) {
+  // The source is always read.
+  if (opOperand == insertSliceOp.getSourceMutable())
+    return true;
+
+  // For the destination, it depends...
+  assert(opOperand == insertSliceOp.getDestMutable() && "expected dest");
+
+  // Dest is not read if it is entirely overwritten. E.g.:
+  // tensor.insert_slice %a into %t[0][10][1] : ... into tensor<10xf32>
+  bool allOffsetsZero =
+      llvm::all_of(insertSliceOp.getMixedOffsets(), isZeroIndex);
+  RankedTensorType destType = insertSliceOp.getDestType();
+  bool sizesMatchDestSizes =
+      areConstantIntValues(insertSliceOp.getMixedSizes(), destType.getShape());
+  bool allStridesOne =
+      areAllConstantIntValue(insertSliceOp.getMixedStrides(), 1);
+  return !(allOffsetsZero && sizesMatchDestSizes && allStridesOne);
+}
+
 /// Bufferization of tensor.insert_slice. Replace with a memory copy. Under
 /// certain circumstances, this op can also be a no-op.
 ///
@@ -646,32 +669,8 @@ struct InsertSliceOpInterface
                                                     tensor::InsertSliceOp> {
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
-    auto insertSliceOp = cast<InsertSliceOp>(op);
-    RankedTensorType destType = insertSliceOp.getDestType();
-
-    // The source is always read.
-    if (opOperand == insertSliceOp.getSourceMutable())
-      return true;
-
-    // For the destination, it depends...
-    assert(opOperand == insertSliceOp.getDestMutable() && "expected dest");
-
-    // Dest is not read if it is entirely overwritten. E.g.:
-    // tensor.insert_slice %a into %t[0][10][1] : ...
into tensor<10xf32>
-    bool allOffsetsZero =
-        llvm::all_of(insertSliceOp.getMixedOffsets(), [](OpFoldResult ofr) {
-          return isConstantIntValue(ofr, 0);
-        });
-    bool sizesMatchDestSizes = llvm::all_of(
-        llvm::enumerate(insertSliceOp.getMixedSizes()), [&](const auto &it) {
-          return getConstantIntValue(it.value()) ==
-                 destType.getDimSize(it.index());
-        });
-    bool allStridesOne =
-        llvm::all_of(insertSliceOp.getMixedStrides(), [](OpFoldResult ofr) {
-          return isConstantIntValue(ofr, 1);
-        });
-    return !(allOffsetsZero && sizesMatchDestSizes && allStridesOne);
+    return insertSliceOpRequiresRead(cast<tensor::InsertSliceOp>(op),
+                                     opOperand);
   }
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
@@ -931,7 +930,8 @@ struct ParallelInsertSliceOpInterface
 
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
-    return true;
+    return insertSliceOpRequiresRead(cast<ParallelInsertSliceOp>(op),
+                                     opOperand);
   }
 
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
index 995486c87771..3566714c6529 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
@@ -16,11 +16,6 @@
 namespace mlir {
 namespace tensor {
 namespace {
 
-static bool areAllConstantIntValue(ArrayRef<OpFoldResult> ofrs, int64_t value) {
-  return llvm::all_of(
-      ofrs, [&](OpFoldResult ofr) { return isConstantIntValue(ofr, value); });
-}
-
 /// Returns the number of shape sizes that is either dynamic or greater than 1.
 static int64_t getNumGtOneDims(ArrayRef<int64_t> shape) {
   return llvm::count_if(
diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index 547d120404ab..3eb6215a7a0b 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/MathExtras.h"
 
 namespace mlir {
@@ -131,12 +132,24 @@ getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
   return res;
 }
 
-/// Return true if `ofr` is constant integer equal to `value`.
 bool isConstantIntValue(OpFoldResult ofr, int64_t value) {
   auto val = getConstantIntValue(ofr);
   return val && *val == value;
 }
 
+bool areAllConstantIntValue(ArrayRef<OpFoldResult> ofrs, int64_t value) {
+  return llvm::all_of(
+      ofrs, [&](OpFoldResult ofr) { return isConstantIntValue(ofr, value); });
+}
+
+bool areConstantIntValues(ArrayRef<OpFoldResult> ofrs,
+                          ArrayRef<int64_t> values) {
+  if (ofrs.size() != values.size())
+    return false;
+  std::optional<SmallVector<int64_t>> constOfrs = getConstantIntValues(ofrs);
+  return constOfrs && llvm::equal(constOfrs.value(), values);
+}
+
 /// Return true if ofr1 and ofr2 are the same integer constant attribute values
 /// or the same SSA value.
/// Ignore integer bitwidth and type mismatch that come from the fact there is
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
index e2169fe1404c..dc4306b8316a 100644
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -213,6 +213,21 @@ func.func @rank_reducing_parallel_insert_slice(%in: tensor<100xf32>, %out: tenso
 
 // -----
 
+// CHECK-LABEL: func.func @parallel_insert_full_slice_in_place
+// CHECK-NOT: memref.alloc()
+func.func @parallel_insert_full_slice_in_place(%2: tensor<2xf32>) -> tensor<2xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %3 = scf.forall (%arg0) in (1) shared_outs(%arg2 = %2) -> (tensor<2xf32>) {
+    %fill = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<2xf32>) -> tensor<2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %fill into %arg2[0] [2] [1] : tensor<2xf32> into tensor<2xf32>
+    }
+  } {mapping = [#gpu.thread<x>]}
+  return %3 : tensor<2xf32>
+}
+
+// -----
+
 // This test case could bufferize in-place with a better analysis. However, it
 // is simpler to let the canonicalizer fold away the tensor.insert_slice.
-- 
GitLab


From 2bff9d9ffe3a4813961c1cf3af2e9ac5a20190bd Mon Sep 17 00:00:00 2001
From: Max191 <44243577+Max191@users.noreply.github.com>
Date: Fri, 18 Oct 2024 13:11:21 -0700
Subject: [PATCH 105/511] [mlir] Don't hoist transfers from potentially zero
 trip loops (#112752)

The hoistRedundantVectorTransfers function does not perform any
verification of loop bounds when hoisting vector transfers. This is not
safe in general, since it is possible that the loop will have a zero
trip count. This PR uses ValueBounds to verify that the lower bound is
less than the upper bound of the loop before hoisting.

Trip count verification is currently behind an option
`verifyNonZeroTrip`, which is false by default.

Zero trip count loops can arise in GPU code generation, where a loop
bound can be dependent on a thread id. If not all threads execute the
loop body, then hoisting out of the loop can cause these threads to
execute the transfers when they are not supposed to.

---------

Signed-off-by: Max Dawkins
---
 .../Linalg/TransformOps/LinalgTransformOps.td |   6 +-
 .../mlir/Dialect/Linalg/Transforms/Hoisting.h |  10 +-
 .../TransformOps/LinalgTransformOps.cpp       |   2 +-
 .../Dialect/Linalg/Transforms/Hoisting.cpp    |  46 ++++++-
 mlir/test/Dialect/Linalg/hoisting.mlir        | 130 +++++++++++++++++-
 5 files changed, 188 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 0915bbde3072..040c04b0410e 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -2294,13 +2294,15 @@ def HoistRedundantVectorTransfersOp :
     function op.
}]; - let arguments = (ins TransformHandleTypeInterface:$target); + let arguments = (ins TransformHandleTypeInterface:$target, + UnitAttr:$verify_non_zero_trip); let results = (outs TransformHandleTypeInterface:$transformed); let assemblyFormat = "$target attr-dict `:` functional-type(operands, results) "; let builders = [ - OpBuilder<(ins "Value":$target)>, + OpBuilder<(ins "Value":$target, + CArg<"bool", "false">:$verify_non_zero_trip)>, ]; let extraClassDeclaration = [{ ::mlir::DiagnosedSilenceableFailure applyToOne( diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h index 236c2ce7d48e..4edf432d9d97 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h @@ -29,6 +29,9 @@ namespace linalg { /// 4. The source operands for vector.transfer_{read|write} do not originate /// from Ops implementing ViewLikeOpInterface (to reduce the risk of /// aliasing). +/// 5. If `verifyNonZeroTrip` is true, then the lower bound of the loop must +/// be statically smaller than the upper bound of the loop, guaranteeing that +/// the loop body will execute at least once. /// To improve hoisting opportunities, call the `moveLoopInvariantCode` helper /// function on the candidate loop above which to hoist. Hoisting the transfers /// results in scf::ForOp yielding the value that originally transited through @@ -41,7 +44,12 @@ namespace linalg { /// /// WARNING: This hoisting does not model parallelism and is generally incorrect /// when used on distributed loops with memref semantics! -void hoistRedundantVectorTransfers(Operation *root); +/// NOTE: Setting `verifyNonZeroTrip = true` makes this more stable for +/// distributed loops with memref semantics, but there could still be some +/// issues when loops are executed a different number of times for different +/// threads. +void hoistRedundantVectorTransfers(Operation *root, + bool verifyNonZeroTrip = false); /// Hoist vector.extract/vector.broadcast pairs out of immediately enclosing /// scf::ForOp iteratively, if the following conditions are met: diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index ad72b5d7becc..1f1d8ad89ae2 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3558,7 +3558,7 @@ transform::HoistRedundantVectorTransfersOp::applyToOne( // WARNING: This hoisting does not model parallelism and is generally // incorrect when used on distributed loops with memref semantics! // TODO: obsolete and should be retired. 
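// Usage sketch (illustrative, mirroring the tests added below): from the
// transform dialect the flag surfaces as a unit attribute,
//   transform.structured.hoist_redundant_vector_transfers %0
//       { verify_non_zero_trip } : (!transform.any_op) -> !transform.any_op
// which corresponds to passing verifyNonZeroTrip = true in the call below.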
-  linalg::hoistRedundantVectorTransfers(target);
+  linalg::hoistRedundantVectorTransfers(target, getVerifyNonZeroTrip());
   results.push_back(target);
   return DiagnosedSilenceableFailure::success();
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
index 94f6b6029875..acfd9683f01f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -199,7 +199,8 @@ static bool noAliasingUseInLoop(vector::TransferReadOp transferRead,
   return true;
 }
 
-void mlir::linalg::hoistRedundantVectorTransfers(Operation *root) {
+void mlir::linalg::hoistRedundantVectorTransfers(Operation *root,
+                                                 bool verifyNonZeroTrip) {
   bool changed = true;
   while (changed) {
     changed = false;
@@ -208,6 +209,43 @@ void mlir::linalg::hoistRedundantVectorTransfers(Operation *root) {
     root->walk(
         [&](LoopLikeOpInterface loopLike) { moveLoopInvariantCode(loopLike); });
 
+    // Find all loops that are certain to have non zero trip count. Any loops
+    // that are not part of this set cannot be hoisted from, since hoisting from
+    // a potentially zero trip count loop may cause a vector transfer to be
+    // executed when it shouldn't be.
+    llvm::DenseSet<LoopLikeOpInterface> definiteNonZeroTripCountLoops;
+    if (verifyNonZeroTrip) {
+      root->walk([&](LoopLikeOpInterface loopLike) {
+        std::optional<SmallVector<OpFoldResult>> lbs =
+            loopLike.getLoopLowerBounds();
+        std::optional<SmallVector<OpFoldResult>> ubs =
+            loopLike.getLoopUpperBounds();
+        // If loop bounds cannot be found, assume possibly zero trip count.
+        if (!lbs || !ubs)
+          return;
+
+        // Otherwise, use ValueBounds to find the maximum lower bound and
+        // minimum upper bound. If the bounds are found, and maxLb is less
+        // than the minUb, then the loop will not have zero trip count.
+        for (auto [lb, ub] : llvm::zip_equal(lbs.value(), ubs.value())) {
+          FailureOr<int64_t> maxLb =
+              ValueBoundsConstraintSet::computeConstantBound(
+                  presburger::BoundType::UB, lb,
+                  /*stopCondition=*/nullptr, /*closedUB=*/true);
+          if (failed(maxLb))
+            return;
+          FailureOr<int64_t> minUb =
+              ValueBoundsConstraintSet::computeConstantBound(
+                  presburger::BoundType::LB, ub);
+          if (failed(minUb))
+            return;
+          if (minUb.value() <= maxLb.value())
+            return;
+          definiteNonZeroTripCountLoops.insert(loopLike);
+        }
+      });
+    }
+
     root->walk([&](vector::TransferReadOp transferRead) {
       if (!isa<MemRefType>(transferRead.getShapedType()))
         return WalkResult::advance();
@@ -220,6 +258,12 @@ void mlir::linalg::hoistRedundantVectorTransfers(Operation *root) {
       if (!isa_and_nonnull<scf::ForOp>(loop))
         return WalkResult::advance();
 
+      if (verifyNonZeroTrip && !definiteNonZeroTripCountLoops.contains(loop)) {
+        LLVM_DEBUG(DBGS() << "Loop may have zero trip count: " << *loop
+                          << "\n");
+        return WalkResult::advance();
+      }
+
       LLVM_DEBUG(DBGS() << "Candidate read: " << *transferRead.getOperation()
                         << "\n");
diff --git a/mlir/test/Dialect/Linalg/hoisting.mlir b/mlir/test/Dialect/Linalg/hoisting.mlir
index 241b8a486c01..4e1035e038ca 100644
--- a/mlir/test/Dialect/Linalg/hoisting.mlir
+++ b/mlir/test/Dialect/Linalg/hoisting.mlir
@@ -308,6 +308,134 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// CHECK-LABEL: func.func @no_hoisting_unknown_bound_loop
+func.func @no_hoisting_unknown_bound_loop(%memref0: memref<20xi32>, %lb: index, %ub: index) {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  // %lb and %ub are unbounded, so do not hoist.
+ // CHECK: scf.for {{.*}} { + // CHECK-NEXT: vector.transfer_read + // CHECK-NEXT: "test.some_use" + scf.for %arg2 = %lb to %ub step %c1 { + %read = vector.transfer_read %memref0[%c0], %c0_i32 {in_bounds = [true]} : memref<20xi32>, vector<4xi32> + "test.some_use"(%read) : (vector<4xi32>) ->() + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.hoist_redundant_vector_transfers %0 { verify_non_zero_trip } + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func.func @no_hoisting_possibly_zero_trip_loop +func.func @no_hoisting_possibly_zero_trip_loop(%memref0: memref<20xi32>, %lb: index, %ub: index) { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // %lb_0 is in range [%lb, 8], and %ub_0 is in range [4, %ub]. + // Since %lb_0 could be greater than %ub_0, do not hoist. + %lb_0 = affine.min affine_map<(d0) -> (d0, 8)>(%lb) + %ub_0 = affine.max affine_map<(d0) -> (d0, 4)>(%ub) + + // CHECK: scf.for {{.*}} { + // CHECK-NEXT: vector.transfer_read + // CHECK-NEXT: "test.some_use" + scf.for %arg2 = %lb_0 to %ub_0 step %c1 { + %read = vector.transfer_read %memref0[%c0], %c0_i32 {in_bounds = [true]} : memref<20xi32>, vector<4xi32> + "test.some_use"(%read) : (vector<4xi32>) ->() + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.hoist_redundant_vector_transfers %0 { verify_non_zero_trip } + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func.func @no_hoisting_possibly_zero_trip_loop_eq_lb_and_ub +func.func @no_hoisting_possibly_zero_trip_loop_eq_lb_and_ub(%memref0: memref<20xi32>, %lb: index, %ub: index) { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // %lb_0 is in range [%lb, 8], and %ub_0 is in range [8, %ub]. + // Since %lb_0 could be equal to %ub_0, do not hoist. + %lb_0 = affine.min affine_map<(d0) -> (d0, 8)>(%lb) + %ub_0 = affine.max affine_map<(d0) -> (d0, 8)>(%ub) + + // CHECK: scf.for {{.*}} { + // CHECK-NEXT: vector.transfer_read + // CHECK-NEXT: "test.some_use" + scf.for %arg2 = %lb_0 to %ub_0 step %c1 { + %read = vector.transfer_read %memref0[%c0], %c0_i32 {in_bounds = [true]} : memref<20xi32>, vector<4xi32> + "test.some_use"(%read) : (vector<4xi32>) ->() + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.hoist_redundant_vector_transfers %0 { verify_non_zero_trip } + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func.func @hoisting_non_zero_trip_loop +func.func @hoisting_non_zero_trip_loop(%memref0: memref<20xi32>, %lb: index, %ub: index) { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // %lb_0 is in range [%lb, 4], and %ub_0 is in range [8, %ub]. 
+ // Since %lb_0 is guaranteed to be less than %ub_0, hoisting is possible. + %lb_0 = affine.min affine_map<(d0) -> (d0, 4)>(%lb) + %ub_0 = affine.max affine_map<(d0) -> (d0, 8)>(%ub) + + // CHECK: vector.transfer_read + // CHECK: scf.for {{.*}} { + // CHECK-NEXT: "test.some_use" + scf.for %arg2 = %lb_0 to %ub_0 step %c1 { + %read = vector.transfer_read %memref0[%c0], %c0_i32 {in_bounds = [true]} : memref<20xi32>, vector<4xi32> + "test.some_use"(%read) : (vector<4xi32>) ->() + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.hoist_redundant_vector_transfers %0 { verify_non_zero_trip } + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + // Regression test - `vector.transfer_read` below should not be hoisted. // Indeed, %collapse_shape (written to by `vector.transfer_write`) and %alloca // (read by `vector.transfer_read`) alias. @@ -366,7 +494,7 @@ func.func @no_hoisting_collapse_shape_2(%vec: vector<1x12x1xi32>) { %collapse_shape = memref.collapse_shape %alloca [[0, 1, 2]] : memref<1x12x1xi32> into memref<12xi32> vector.transfer_write %vec, %alloca[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x12x1xi32>, memref<1x12x1xi32> %read = vector.transfer_read %collapse_shape[%c0], %c0_i32 {in_bounds = [true]} : memref<12xi32>, vector<12xi32> - "prevent.dce"(%read) : (vector<12xi32>) ->() + "test.some_use"(%read) : (vector<12xi32>) ->() } return } -- GitLab From e669bbbb7265a7d4d59bac2d3889194efa167ea8 Mon Sep 17 00:00:00 2001 From: Luke Drummond Date: Fri, 18 Oct 2024 21:16:24 +0100 Subject: [PATCH 106/511] Revert "Finally formalise our defacto line-ending policy" This reverts commit dccebddb3b802c4c1fe287222e454b63f850f012. 
--- .gitattributes | 7 ------- clang-tools-extra/clangd/test/.gitattributes | 3 --- clang/test/.gitattributes | 4 ---- llvm/docs/TestingGuide.rst | 6 ------ llvm/test/FileCheck/.gitattributes | 1 - llvm/test/tools/llvm-ar/Inputs/.gitattributes | 1 - llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes | 1 - 7 files changed, 23 deletions(-) delete mode 100644 clang-tools-extra/clangd/test/.gitattributes delete mode 100644 clang/test/.gitattributes delete mode 100644 llvm/test/FileCheck/.gitattributes delete mode 100644 llvm/test/tools/llvm-ar/Inputs/.gitattributes delete mode 100644 llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes diff --git a/.gitattributes b/.gitattributes index aced01d485c1..6b281f33f737 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,10 +1,3 @@ -# Checkout as native, commit as LF except in specific circumstances -* text=auto -*.bat text eol=crlf -*.rc text eol=crlf -*.sln text eol=crlf -*.natvis text eol=crlf - libcxx/src/**/*.cpp merge=libcxx-reformat libcxx/include/**/*.h merge=libcxx-reformat diff --git a/clang-tools-extra/clangd/test/.gitattributes b/clang-tools-extra/clangd/test/.gitattributes deleted file mode 100644 index 20971adc2b5d..000000000000 --- a/clang-tools-extra/clangd/test/.gitattributes +++ /dev/null @@ -1,3 +0,0 @@ -input-mirror.test text eol=crlf -too_large.test text eol=crlf -protocol.test text eol=crlf diff --git a/clang/test/.gitattributes b/clang/test/.gitattributes deleted file mode 100644 index 160fc6cf5617..000000000000 --- a/clang/test/.gitattributes +++ /dev/null @@ -1,4 +0,0 @@ -FixIt/fixit-newline-style.c text eol=crlf -Frontend/system-header-line-directive-ms-lineendings.c text eol=crlf -Frontend/rewrite-includes-mixed-eol-crlf.* text eol=crlf -clang/test/Frontend/rewrite-includes-mixed-eol-lf.h text eolf=lf diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst index 344a295226f6..08617933519f 100644 --- a/llvm/docs/TestingGuide.rst +++ b/llvm/docs/TestingGuide.rst @@ -360,12 +360,6 @@ Best practices for regression tests - Try to give values (including variables, blocks and functions) meaningful names, and avoid retaining complex names generated by the optimization pipeline (such as ``%foo.0.0.0.0.0.0``). -- If your tests depend on specific input file encodings, beware of line-ending - issues across different platforms, and in the project's history. Before you - commit tests that depend on explicit encodings, consider adding filetype or - specific line-ending annotations to a `<.gitattributes - https://git-scm.com/docs/gitattributes#_effects>`_ file in the appropriate - directory in the repository. 
Extra files ----------- diff --git a/llvm/test/FileCheck/.gitattributes b/llvm/test/FileCheck/.gitattributes deleted file mode 100644 index ba27d7fad76d..000000000000 --- a/llvm/test/FileCheck/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -dos-style-eol.txt text eol=crlf diff --git a/llvm/test/tools/llvm-ar/Inputs/.gitattributes b/llvm/test/tools/llvm-ar/Inputs/.gitattributes deleted file mode 100644 index 6c8a26285daf..000000000000 --- a/llvm/test/tools/llvm-ar/Inputs/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -mri-crlf.mri text eol=crlf diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes b/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes deleted file mode 100644 index 2df17345df5b..000000000000 --- a/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.dos text eol=crlf -- GitLab From b55c52c047a167f42abbde9a33356cfb96b82c7f Mon Sep 17 00:00:00 2001 From: Luke Drummond Date: Fri, 18 Oct 2024 21:16:50 +0100 Subject: [PATCH 107/511] Revert "Renormalize line endings whitespace only after dccebddb3b80" This reverts commit 9d98acb196a40fee5229afeb08f95fd36d41c10a. --- .../clangd/test/input-mirror.test | 34 +- clang-tools-extra/clangd/test/protocol.test | 226 +- clang-tools-extra/clangd/test/too_large.test | 14 +- clang/test/AST/HLSL/StructuredBuffer-AST.hlsl | 128 +- clang/test/C/C2y/n3262.c | 40 +- clang/test/C/C2y/n3274.c | 36 +- .../StructuredBuffer-annotations.hlsl | 44 +- .../StructuredBuffer-elementtype.hlsl | 140 +- .../builtins/StructuredBuffer-subscript.hlsl | 34 +- clang/test/CodeGenHLSL/builtins/atan2.hlsl | 118 +- clang/test/CodeGenHLSL/builtins/cross.hlsl | 74 +- clang/test/CodeGenHLSL/builtins/length.hlsl | 146 +- .../test/CodeGenHLSL/builtins/normalize.hlsl | 170 +- clang/test/CodeGenHLSL/builtins/step.hlsl | 168 +- clang/test/Driver/flang/msvc-link.f90 | 10 +- clang/test/FixIt/fixit-newline-style.c | 22 +- .../rewrite-includes-mixed-eol-crlf.c | 16 +- .../rewrite-includes-mixed-eol-crlf.h | 22 +- ...tem-header-line-directive-ms-lineendings.c | 42 +- clang/test/ParserHLSL/bitfields.hlsl | 60 +- .../hlsl_annotations_on_struct_members.hlsl | 42 +- .../ParserHLSL/hlsl_contained_type_attr.hlsl | 50 +- .../hlsl_contained_type_attr_error.hlsl | 56 +- clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl | 44 +- .../ParserHLSL/hlsl_is_rov_attr_error.hlsl | 40 +- .../test/ParserHLSL/hlsl_raw_buffer_attr.hlsl | 44 +- .../hlsl_raw_buffer_attr_error.hlsl | 34 +- .../ParserHLSL/hlsl_resource_class_attr.hlsl | 74 +- .../hlsl_resource_class_attr_error.hlsl | 44 +- .../hlsl_resource_handle_attrs.hlsl | 42 +- clang/test/Sema/aarch64-sve-vector-trig-ops.c | 130 +- clang/test/Sema/riscv-rvv-vector-trig-ops.c | 134 +- .../avail-diag-default-compute.hlsl | 238 +- .../Availability/avail-diag-default-lib.hlsl | 360 +- .../avail-diag-relaxed-compute.hlsl | 238 +- .../Availability/avail-diag-relaxed-lib.hlsl | 324 +- .../avail-diag-strict-compute.hlsl | 256 +- .../Availability/avail-diag-strict-lib.hlsl | 384 +- .../avail-lib-multiple-stages.hlsl | 114 +- .../SemaHLSL/BuiltIns/StructuredBuffers.hlsl | 38 +- .../test/SemaHLSL/BuiltIns/cross-errors.hlsl | 86 +- .../BuiltIns/half-float-only-errors2.hlsl | 26 +- .../test/SemaHLSL/BuiltIns/length-errors.hlsl | 64 +- .../SemaHLSL/BuiltIns/normalize-errors.hlsl | 62 +- clang/test/SemaHLSL/BuiltIns/step-errors.hlsl | 62 +- .../Types/Traits/IsIntangibleType.hlsl | 162 +- .../Types/Traits/IsIntangibleTypeErrors.hlsl | 24 +- .../resource_binding_attr_error_basic.hlsl | 84 +- .../resource_binding_attr_error_other.hlsl 
| 18 +- .../resource_binding_attr_error_resource.hlsl | 98 +- ...urce_binding_attr_error_silence_diags.hlsl | 54 +- .../resource_binding_attr_error_space.hlsl | 124 +- .../resource_binding_attr_error_udt.hlsl | 270 +- clang/tools/scan-build/bin/scan-build.bat | 2 +- .../tools/scan-build/libexec/c++-analyzer.bat | 2 +- .../tools/scan-build/libexec/ccc-analyzer.bat | 2 +- clang/utils/ClangVisualizers/clang.natvis | 2178 ++--- .../test/Driver/msvc-dependent-lib-flags.f90 | 72 +- .../ir-interpreter-phi-nodes/Makefile | 8 +- .../postmortem/minidump/fizzbuzz.syms | 4 +- .../target-new-solib-notifications/Makefile | 46 +- .../target-new-solib-notifications/a.cpp | 6 +- .../target-new-solib-notifications/b.cpp | 2 +- .../target-new-solib-notifications/c.cpp | 2 +- .../target-new-solib-notifications/d.cpp | 2 +- .../target-new-solib-notifications/main.cpp | 32 +- .../unwind/zeroth_frame/Makefile | 6 +- .../unwind/zeroth_frame/TestZerothFrame.py | 176 +- lldb/test/API/python_api/debugger/Makefile | 6 +- lldb/test/Shell/BuildScript/modes.test | 70 +- lldb/test/Shell/BuildScript/script-args.test | 64 +- .../Shell/BuildScript/toolchain-clang-cl.test | 98 +- .../Windows/Sigsegv/Inputs/sigsegv.cpp | 80 +- .../NativePDB/Inputs/inline_sites.s | 1244 +-- .../Inputs/inline_sites_live.lldbinit | 14 +- .../Inputs/local-variables-registers.lldbinit | 70 +- .../NativePDB/Inputs/lookup-by-types.lldbinit | 6 +- .../subfield_register_simple_type.lldbinit | 4 +- .../NativePDB/function-types-classes.cpp | 12 +- .../NativePDB/inline_sites_live.cpp | 68 +- .../SymbolFile/NativePDB/lookup-by-types.cpp | 92 +- lldb/unittests/Breakpoint/CMakeLists.txt | 20 +- llvm/benchmarks/FormatVariadicBM.cpp | 126 +- .../GetIntrinsicForClangBuiltin.cpp | 100 +- .../GetIntrinsicInfoTableEntriesBM.cpp | 60 +- llvm/docs/_static/LoopOptWG_invite.ics | 160 +- llvm/lib/Support/rpmalloc/CACHE.md | 38 +- llvm/lib/Support/rpmalloc/README.md | 440 +- llvm/lib/Support/rpmalloc/malloc.c | 1448 +-- llvm/lib/Support/rpmalloc/rpmalloc.c | 7984 ++++++++--------- llvm/lib/Support/rpmalloc/rpmalloc.h | 856 +- llvm/lib/Support/rpmalloc/rpnew.h | 226 +- .../DirectX/DirectXTargetTransformInfo.cpp | 76 +- llvm/test/CodeGen/DirectX/atan2.ll | 174 +- llvm/test/CodeGen/DirectX/atan2_error.ll | 22 +- llvm/test/CodeGen/DirectX/cross.ll | 112 +- llvm/test/CodeGen/DirectX/normalize.ll | 224 +- llvm/test/CodeGen/DirectX/normalize_error.ll | 20 +- llvm/test/CodeGen/DirectX/step.ll | 156 +- .../CodeGen/SPIRV/hlsl-intrinsics/atan2.ll | 98 +- .../CodeGen/SPIRV/hlsl-intrinsics/cross.ll | 66 +- .../CodeGen/SPIRV/hlsl-intrinsics/length.ll | 58 +- .../SPIRV/hlsl-intrinsics/normalize.ll | 62 +- .../CodeGen/SPIRV/hlsl-intrinsics/step.ll | 66 +- .../Demangle/ms-placeholder-return-type.test | 36 +- llvm/test/FileCheck/dos-style-eol.txt | 20 +- llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri | 8 +- .../tools/llvm-cvtres/Inputs/languages.rc | 72 +- .../tools/llvm-cvtres/Inputs/test_resource.rc | 98 +- .../tools/llvm-rc/Inputs/dialog-with-menu.rc | 32 +- .../COFF/Inputs/resources/test_resource.rc | 88 +- llvm/unittests/Support/ModRefTest.cpp | 54 +- llvm/utils/LLVMVisualizers/llvm.natvis | 816 +- .../lit/tests/Inputs/shtest-shell/diff-in.dos | 6 +- llvm/utils/release/build_llvm_release.bat | 1030 +-- openmp/runtime/doc/doxygen/config | 3644 ++++---- pstl/CREDITS.txt | 42 +- 117 files changed, 14135 insertions(+), 14135 deletions(-) diff --git a/clang-tools-extra/clangd/test/input-mirror.test b/clang-tools-extra/clangd/test/input-mirror.test index bce3f9923a3b..a34a4a08cf60 100644 
--- a/clang-tools-extra/clangd/test/input-mirror.test +++ b/clang-tools-extra/clangd/test/input-mirror.test @@ -1,17 +1,17 @@ -# RUN: clangd -pretty -sync -input-mirror-file %t < %s -# Note that we have to use '-b' as -input-mirror-file does not have a newline at the end of file. -# RUN: diff -b %t %s -# It is absolutely vital that this file has CRLF line endings. -# -Content-Length: 125 - -{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} -Content-Length: 172 - -{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"int main() {\nint a;\na;\n}\n"}}} -Content-Length: 44 - -{"jsonrpc":"2.0","id":3,"method":"shutdown"} -Content-Length: 33 - -{"jsonrpc":"2.0","method":"exit"} +# RUN: clangd -pretty -sync -input-mirror-file %t < %s +# Note that we have to use '-b' as -input-mirror-file does not have a newline at the end of file. +# RUN: diff -b %t %s +# It is absolutely vital that this file has CRLF line endings. +# +Content-Length: 125 + +{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} +Content-Length: 172 + +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"int main() {\nint a;\na;\n}\n"}}} +Content-Length: 44 + +{"jsonrpc":"2.0","id":3,"method":"shutdown"} +Content-Length: 33 + +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/clangd/test/protocol.test b/clang-tools-extra/clangd/test/protocol.test index 64ccfaef1891..5e852d1d9dee 100644 --- a/clang-tools-extra/clangd/test/protocol.test +++ b/clang-tools-extra/clangd/test/protocol.test @@ -1,113 +1,113 @@ -# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s | FileCheck -strict-whitespace %s -# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s 2>&1 | FileCheck -check-prefix=STDERR %s -# vim: fileformat=dos -# It is absolutely vital that this file has CRLF line endings. -# -# Note that we invert the test because we intent to let clangd exit prematurely. 
-# -# Test protocol parsing -Content-Length: 125 -Content-Type: application/vscode-jsonrpc; charset-utf-8 - -{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} -# Test message with Content-Type after Content-Length -# -# CHECK: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK: } -Content-Length: 246 - -{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"struct fake { int a, bb, ccc; int f(int i, const float f) const; };\nint main() {\n fake f;\n f.\n}\n"}}} - -Content-Length: 104 - -{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp"}}} - -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 146 - -{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with Content-Type before Content-Length -# -# CHECK: "id": 1, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } - -X-Test: Testing -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 146 -Content-Type: application/vscode-jsonrpc; charset-utf-8 -X-Testing: Test - -{"jsonrpc":"2.0","id":2,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} - -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 10 -Content-Length: 146 - -{"jsonrpc":"2.0","id":3,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with duplicate Content-Length headers -# -# CHECK: "id": 3, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } -# STDERR: Warning: Duplicate Content-Length header received. The previous value for this message (10) was ignored. 
- -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 10 - -{"jsonrpc":"2.0","id":4,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with malformed Content-Length -# -# STDERR: JSON parse error -# Ensure we recover by sending another (valid) message - -Content-Length: 146 - -{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with Content-Type before Content-Length -# -# CHECK: "id": 5, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } -Content-Length: 1024 - -{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message which reads beyond the end of the stream. -# -# Ensure this is the last test in the file! -# STDERR: Input was aborted. Read only {{[0-9]+}} bytes of expected {{[0-9]+}}. - +# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s | FileCheck -strict-whitespace %s +# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s 2>&1 | FileCheck -check-prefix=STDERR %s +# vim: fileformat=dos +# It is absolutely vital that this file has CRLF line endings. +# +# Note that we invert the test because we intent to let clangd exit prematurely. +# +# Test protocol parsing +Content-Length: 125 +Content-Type: application/vscode-jsonrpc; charset-utf-8 + +{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} +# Test message with Content-Type after Content-Length +# +# CHECK: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: } +Content-Length: 246 + +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"struct fake { int a, bb, ccc; int f(int i, const float f) const; };\nint main() {\n fake f;\n f.\n}\n"}}} + +Content-Length: 104 + +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp"}}} + +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 146 + +{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with Content-Type before Content-Length +# +# CHECK: "id": 1, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } + +X-Test: Testing +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 146 +Content-Type: application/vscode-jsonrpc; charset-utf-8 +X-Testing: Test + +{"jsonrpc":"2.0","id":2,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} + +Content-Type: 
application/vscode-jsonrpc; charset-utf-8 +Content-Length: 10 +Content-Length: 146 + +{"jsonrpc":"2.0","id":3,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with duplicate Content-Length headers +# +# CHECK: "id": 3, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } +# STDERR: Warning: Duplicate Content-Length header received. The previous value for this message (10) was ignored. + +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 10 + +{"jsonrpc":"2.0","id":4,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with malformed Content-Length +# +# STDERR: JSON parse error +# Ensure we recover by sending another (valid) message + +Content-Length: 146 + +{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with Content-Type before Content-Length +# +# CHECK: "id": 5, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } +Content-Length: 1024 + +{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message which reads beyond the end of the stream. +# +# Ensure this is the last test in the file! +# STDERR: Input was aborted. Read only {{[0-9]+}} bytes of expected {{[0-9]+}}. + diff --git a/clang-tools-extra/clangd/test/too_large.test b/clang-tools-extra/clangd/test/too_large.test index 6986bd5e258e..7df981e79420 100644 --- a/clang-tools-extra/clangd/test/too_large.test +++ b/clang-tools-extra/clangd/test/too_large.test @@ -1,7 +1,7 @@ -# RUN: not clangd -sync < %s 2>&1 | FileCheck -check-prefix=STDERR %s -# vim: fileformat=dos -# It is absolutely vital that this file has CRLF line endings. -# -Content-Length: 2147483648 - -# STDERR: Refusing to read message +# RUN: not clangd -sync < %s 2>&1 | FileCheck -check-prefix=STDERR %s +# vim: fileformat=dos +# It is absolutely vital that this file has CRLF line endings. +# +Content-Length: 2147483648 + +# STDERR: Refusing to read message diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl index 9c1630f6f570..030fcfc31691 100644 --- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -1,64 +1,64 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s - - -// This test tests two different AST generations. 
The "EMPTY" test mode verifies -// the AST generated by forward declaration of the HLSL types which happens on -// initializing the HLSL external AST with an AST Context. - -// The non-empty mode has a use that requires the StructuredBuffer type be complete, -// which results in the AST being populated by the external AST source. That -// case covers the full implementation of the template declaration and the -// instantiated specialization. - -// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer -// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type -// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer -// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final - -// There should be no more occurrances of StructuredBuffer -// EMPTY-NOT: StructuredBuffer - -#ifndef EMPTY - -StructuredBuffer Buffer; - -#endif - -// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer -// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type -// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition - -// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer - -// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' -// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' -// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} -// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer' lvalue implicit this -// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline - -// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' -// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' -// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} -// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer' lvalue implicit this -// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline - -// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer definition - -// CHECK: TemplateArgument type 'float' -// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' -// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s + + +// This test tests two different AST generations. 
The "EMPTY" test mode verifies +// the AST generated by forward declaration of the HLSL types which happens on +// initializing the HLSL external AST with an AST Context. + +// The non-empty mode has a use that requires the StructuredBuffer type be complete, +// which results in the AST being populated by the external AST source. That +// case covers the full implementation of the template declaration and the +// instantiated specialization. + +// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer +// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer +// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final + +// There should be no more occurrances of StructuredBuffer +// EMPTY-NOT: StructuredBuffer + +#ifndef EMPTY + +StructuredBuffer Buffer; + +#endif + +// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer +// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition + +// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer + +// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' +// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer' lvalue implicit this +// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline + +// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' +// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer' lvalue implicit this +// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline + +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer definition + +// CHECK: TemplateArgument type 'float' +// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' +// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/C/C2y/n3262.c b/clang/test/C/C2y/n3262.c index 864ab351bdbc..3ff2062d88dd 100644 --- a/clang/test/C/C2y/n3262.c +++ b/clang/test/C/C2y/n3262.c @@ -1,20 +1,20 @@ -// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s -// expected-no-diagnostics - -/* WG14 N3262: Yes - * Usability of a byte-wise copy of va_list - * - * NB: Clang 
explicitly documents this as being undefined behavior. A - * diagnostic is produced for some targets but not for others for assignment or - * initialization, but no diagnostic is possible to produce for use with memcpy - * in the general case, nor with a manual bytewise copy via a for loop. - * - * Therefore, nothing is tested in this file; it serves as a reminder that we - * validated our documentation against the paper. See - * clang/docs/LanguageExtensions.rst for more details. - * - * FIXME: it would be nice to add ubsan support for recognizing when an invalid - * copy is made and diagnosing on copy (or on use of the copied va_list). - */ - -int main() {} +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s +// expected-no-diagnostics + +/* WG14 N3262: Yes + * Usability of a byte-wise copy of va_list + * + * NB: Clang explicitly documents this as being undefined behavior. A + * diagnostic is produced for some targets but not for others for assignment or + * initialization, but no diagnostic is possible to produce for use with memcpy + * in the general case, nor with a manual bytewise copy via a for loop. + * + * Therefore, nothing is tested in this file; it serves as a reminder that we + * validated our documentation against the paper. See + * clang/docs/LanguageExtensions.rst for more details. + * + * FIXME: it would be nice to add ubsan support for recognizing when an invalid + * copy is made and diagnosing on copy (or on use of the copied va_list). + */ + +int main() {} diff --git a/clang/test/C/C2y/n3274.c b/clang/test/C/C2y/n3274.c index 6bf8d72d0f33..ccdb89f4069d 100644 --- a/clang/test/C/C2y/n3274.c +++ b/clang/test/C/C2y/n3274.c @@ -1,18 +1,18 @@ -// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s -// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s - -/* WG14 N3274: Yes - * Remove imaginary types - */ - -// Clang has never supported _Imaginary. -#ifdef __STDC_IEC_559_COMPLEX__ -#error "When did this happen?" -#endif - -_Imaginary float i; // expected-error {{imaginary types are not supported}} - -// _Imaginary is a keyword in older language modes, but doesn't need to be one -// in C2y or later. However, to improve diagnostic behavior, we retain it as a -// keyword in all language modes -- it is not available as an identifier. -static_assert(!__is_identifier(_Imaginary)); +// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s + +/* WG14 N3274: Yes + * Remove imaginary types + */ + +// Clang has never supported _Imaginary. +#ifdef __STDC_IEC_559_COMPLEX__ +#error "When did this happen?" +#endif + +_Imaginary float i; // expected-error {{imaginary types are not supported}} + +// _Imaginary is a keyword in older language modes, but doesn't need to be one +// in C2y or later. However, to improve diagnostic behavior, we retain it as a +// keyword in all language modes -- it is not available as an identifier. 
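// (Note: Clang's __is_identifier(X) evaluates to 1 only when X is an ordinary
// identifier, and to 0 when X names a keyword, which is why the assertion on
// the next line holds in every language mode.)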
+static_assert(!__is_identifier(_Imaginary));
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
index 81c5837d8f20..4d3d4908c396 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
-
-StructuredBuffer<float> Buffer1;
-StructuredBuffer<vector<float, 4> > BufferArray[4];
-
-StructuredBuffer<float> Buffer2 : register(u3);
-StructuredBuffer<vector<float, 4> > BufferArray2[4] : register(u4);
-
-StructuredBuffer<float> Buffer3 : register(u3, space1);
-StructuredBuffer<vector<float, 4> > BufferArray3[4] : register(u4, space1);
-
-[numthreads(1,1,1)]
-void main() {
-}
-
-// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]}
-// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0}
-// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0}
-// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0}
-// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0}
-// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1}
-// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+StructuredBuffer<float> Buffer1;
+StructuredBuffer<vector<float, 4> > BufferArray[4];
+
+StructuredBuffer<float> Buffer2 : register(u3);
+StructuredBuffer<vector<float, 4> > BufferArray2[4] : register(u4);
+
+StructuredBuffer<float> Buffer3 : register(u3, space1);
+StructuredBuffer<vector<float, 4> > BufferArray3[4] : register(u4, space1);
+
+[numthreads(1,1,1)]
+void main() {
+}
+
+// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]}
+// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0}
+// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0}
+// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1}
+// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1}
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
index 435a904327a2..a99c7f98a1af 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
@@ -1,70 +1,70 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
-
-// NOTE: The number in type name and whether the struct is packed or not will most
-// likely change once subscript operators are properly implemented (llvm/llvm-project#95956)
-// and the interim field of the contained type is removed. 
-
-// CHECK: %"class.hlsl::StructuredBuffer" = type <{ target("dx.RawBuffer", i16, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.0" = type <{ target("dx.RawBuffer", i16, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.3" = type { target("dx.RawBuffer", i32, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.5" = type { target("dx.RawBuffer", i64, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.6" = type <{ target("dx.RawBuffer", half, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.8" = type { target("dx.RawBuffer", float, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.9" = type { target("dx.RawBuffer", double, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.10" = type { target("dx.RawBuffer", <4 x i16>, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.11" = type { target("dx.RawBuffer", <3 x i32>, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.12" = type { target("dx.RawBuffer", <2 x half>, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.13" = type { target("dx.RawBuffer", <3 x float>, 1, 0)
-
-StructuredBuffer<int16_t> BufI16;
-StructuredBuffer<uint16_t> BufU16;
-StructuredBuffer<int> BufI32;
-StructuredBuffer<uint> BufU32;
-StructuredBuffer<int64_t> BufI64;
-StructuredBuffer<uint64_t> BufU64;
-StructuredBuffer<half> BufF16;
-StructuredBuffer<float> BufF32;
-StructuredBuffer<double> BufF64;
-StructuredBuffer< vector<int16_t, 4> > BufI16x4;
-StructuredBuffer< vector<uint, 3> > BufU32x3;
-StructuredBuffer<half2> BufF16x2;
-StructuredBuffer<float3> BufF32x3;
-// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
-// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
-// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
-// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
-// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
-// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
-
-[numthreads(1,1,1)]
-void main(int GI : SV_GroupIndex) {
- BufI16[GI] = 0;
- BufU16[GI] = 0;
- BufI32[GI] = 0;
- BufU32[GI] = 0;
- BufI64[GI] = 0;
- BufU64[GI] = 0;
- BufF16[GI] = 0;
- BufF32[GI] = 0;
- BufF64[GI] = 0;
- BufI16x4[GI] = 0;
- BufU32x3[GI] = 0;
- BufF16x2[GI] = 0;
- BufF32x3[GI] = 0;
-}
-
-// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9,
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
+
+// NOTE: The number in type name and whether the struct is packed or not will most
+// likely change once subscript operators are properly implemented (llvm/llvm-project#95956)
+// and the interim field of the contained type is removed. 
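// (Note: in the LLVM IR checks below, <{ ... }> denotes a packed struct and
// { ... } an ordinary one; the checks pin down which element types currently
// produce the packed form.)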
+
+// CHECK: %"class.hlsl::StructuredBuffer" = type <{ target("dx.RawBuffer", i16, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.0" = type <{ target("dx.RawBuffer", i16, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.3" = type { target("dx.RawBuffer", i32, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.5" = type { target("dx.RawBuffer", i64, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.6" = type <{ target("dx.RawBuffer", half, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.8" = type { target("dx.RawBuffer", float, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.9" = type { target("dx.RawBuffer", double, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.10" = type { target("dx.RawBuffer", <4 x i16>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.11" = type { target("dx.RawBuffer", <3 x i32>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.12" = type { target("dx.RawBuffer", <2 x half>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.13" = type { target("dx.RawBuffer", <3 x float>, 1, 0)
+
+StructuredBuffer<int16_t> BufI16;
+StructuredBuffer<uint16_t> BufU16;
+StructuredBuffer<int> BufI32;
+StructuredBuffer<uint> BufU32;
+StructuredBuffer<int64_t> BufI64;
+StructuredBuffer<uint64_t> BufU64;
+StructuredBuffer<half> BufF16;
+StructuredBuffer<float> BufF32;
+StructuredBuffer<double> BufF64;
+StructuredBuffer< vector<int16_t, 4> > BufI16x4;
+StructuredBuffer< vector<uint, 3> > BufU32x3;
+StructuredBuffer<half2> BufF16x2;
+StructuredBuffer<float3> BufF32x3;
+// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
+// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
+// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
+// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
+// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
+// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
+
+[numthreads(1,1,1)]
+void main(int GI : SV_GroupIndex) {
+ BufI16[GI] = 0;
+ BufU16[GI] = 0;
+ BufI32[GI] = 0;
+ BufU32[GI] = 0;
+ BufI64[GI] = 0;
+ BufU64[GI] = 0;
+ BufF16[GI] = 0;
+ BufF32[GI] = 0;
+ BufF64[GI] = 0;
+ BufI16x4[GI] = 0;
+ BufU32x3[GI] = 0;
+ BufF16x2[GI] = 0;
+ BufF32x3[GI] = 0;
+}
+
+// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9,
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
index 89bde9236288..155749ec4f94 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
@@ -1,17 +1,17 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
-
-StructuredBuffer In;
-StructuredBuffer Out;
-
-[numthreads(1,1,1)]
-void main(unsigned GI : SV_GroupIndex) {
- Out[GI] = In[GI];
-}
-
-// Even at -O0 the subscript operators get inlined. 
The -O0 IR is a bit messy -// and confusing to follow so the match here is pretty weak. - -// CHECK: define void @main() -// Verify inlining leaves only calls to "llvm." intrinsics -// CHECK-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} -// CHECK: ret void +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s + +StructuredBuffer In; +StructuredBuffer Out; + +[numthreads(1,1,1)] +void main(unsigned GI : SV_GroupIndex) { + Out[GI] = In[GI]; +} + +// Even at -O0 the subscript operators get inlined. The -O0 IR is a bit messy +// and confusing to follow so the match here is pretty weak. + +// CHECK: define void @main() +// Verify inlining leaves only calls to "llvm." intrinsics +// CHECK-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} +// CHECK: ret void diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl index ada269db2f00..40796052e608 100644 --- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl @@ -1,59 +1,59 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF - -// CHECK-LABEL: test_atan2_half -// NATIVE_HALF: call half @llvm.atan2.f16 -// NO_HALF: call float @llvm.atan2.f32 -half test_atan2_half (half p0, half p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_half2 -// NATIVE_HALF: call <2 x half> @llvm.atan2.v2f16 -// NO_HALF: call <2 x float> @llvm.atan2.v2f32 -half2 test_atan2_half2 (half2 p0, half2 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_half3 -// NATIVE_HALF: call <3 x half> @llvm.atan2.v3f16 -// NO_HALF: call <3 x float> @llvm.atan2.v3f32 -half3 test_atan2_half3 (half3 p0, half3 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_half4 -// NATIVE_HALF: call <4 x half> @llvm.atan2.v4f16 -// NO_HALF: call <4 x float> @llvm.atan2.v4f32 -half4 test_atan2_half4 (half4 p0, half4 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_float -// CHECK: call float @llvm.atan2.f32 -float test_atan2_float (float p0, float p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_float2 -// CHECK: call <2 x float> @llvm.atan2.v2f32 -float2 test_atan2_float2 (float2 p0, float2 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_float3 -// CHECK: call <3 x float> @llvm.atan2.v3f32 -float3 test_atan2_float3 (float3 p0, float3 p1) { - return atan2(p0, p1); -} - -// CHECK-LABEL: test_atan2_float4 -// CHECK: call <4 x float> @llvm.atan2.v4f32 -float4 test_atan2_float4 (float4 p0, float4 p1) { - return atan2(p0, p1); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF + +// CHECK-LABEL: test_atan2_half +// NATIVE_HALF: call half @llvm.atan2.f16 +// NO_HALF: call float @llvm.atan2.f32 +half test_atan2_half (half p0, half p1) { + return atan2(p0, p1); 
+} + +// CHECK-LABEL: test_atan2_half2 +// NATIVE_HALF: call <2 x half> @llvm.atan2.v2f16 +// NO_HALF: call <2 x float> @llvm.atan2.v2f32 +half2 test_atan2_half2 (half2 p0, half2 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_half3 +// NATIVE_HALF: call <3 x half> @llvm.atan2.v3f16 +// NO_HALF: call <3 x float> @llvm.atan2.v3f32 +half3 test_atan2_half3 (half3 p0, half3 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_half4 +// NATIVE_HALF: call <4 x half> @llvm.atan2.v4f16 +// NO_HALF: call <4 x float> @llvm.atan2.v4f32 +half4 test_atan2_half4 (half4 p0, half4 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float +// CHECK: call float @llvm.atan2.f32 +float test_atan2_float (float p0, float p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float2 +// CHECK: call <2 x float> @llvm.atan2.v2f32 +float2 test_atan2_float2 (float2 p0, float2 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float3 +// CHECK: call <3 x float> @llvm.atan2.v3f32 +float3 test_atan2_float3 (float3 p0, float3 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float4 +// CHECK: call <4 x float> @llvm.atan2.v4f32 +float4 test_atan2_float4 (float4 p0, float4 p1) { + return atan2(p0, p1); +} diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl index eba710c905bf..514e57d36b20 100644 --- a/clang/test/CodeGenHLSL/builtins/cross.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl @@ -1,37 +1,37 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv - -// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half> -// NATIVE_HALF: ret <3 x half> %hlsl.cross -// NO_HALF: define [[FNATTRS]] <3 x float> @ -// NO_HALF: call <3 x float> @llvm.[[TARGET]].cross.v3f32(<3 x float> -// NO_HALF: ret <3 x float> %hlsl.cross -half3 test_cross_half3(half3 p0, half3 p1) -{ - return cross(p0, p1); -} - -// CHECK: define [[FNATTRS]] <3 x float> @ -// CHECK: %hlsl.cross = call <3 x float> @llvm.[[TARGET]].cross.v3f32( -// CHECK: ret <3 x float> %hlsl.cross -float3 test_cross_float3(float3 p0, float3 p1) -{ - return cross(p0, p1); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: 
dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half> +// NATIVE_HALF: ret <3 x half> %hlsl.cross +// NO_HALF: define [[FNATTRS]] <3 x float> @ +// NO_HALF: call <3 x float> @llvm.[[TARGET]].cross.v3f32(<3 x float> +// NO_HALF: ret <3 x float> %hlsl.cross +half3 test_cross_half3(half3 p0, half3 p1) +{ + return cross(p0, p1); +} + +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.cross = call <3 x float> @llvm.[[TARGET]].cross.v3f32( +// CHECK: ret <3 x float> %hlsl.cross +float3 test_cross_float3(float3 p0, float3 p1) +{ + return cross(p0, p1); +} diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl index 9b0293c218a5..1c23b0df04df 100644 --- a/clang/test/CodeGenHLSL/builtins/length.hlsl +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl @@ -1,73 +1,73 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF - -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: call half @llvm.fabs.f16(half -// NO_HALF: call float @llvm.fabs.f32(float -// NATIVE_HALF: ret half -// NO_HALF: ret float -half test_length_half(half p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v2f16 -// NO_HALF: %hlsl.length = call float @llvm.dx.length.v2f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half2(half2 p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v3f16 -// NO_HALF: %hlsl.length = call float @llvm.dx.length.v3f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half3(half3 p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v4f16 -// NO_HALF: %hlsl.length = call float @llvm.dx.length.v4f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half4(half4 p0) -{ - return length(p0); -} - -// CHECK: define noundef float @ -// CHECK: call float @llvm.fabs.f32(float -// CHECK: ret float -float test_length_float(float p0) -{ - return length(p0); -} -// CHECK: define noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v2f32( -// CHECK: ret float %hlsl.length -float test_length_float2(float2 p0) -{ - return length(p0); -} -// CHECK: define 
noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v3f32( -// CHECK: ret float %hlsl.length -float test_length_float3(float3 p0) -{ - return length(p0); -} -// CHECK: define noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v4f32( -// CHECK: ret float %hlsl.length -float test_length_float4(float4 p0) -{ - return length(p0); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF + +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: call half @llvm.fabs.f16(half +// NO_HALF: call float @llvm.fabs.f32(float +// NATIVE_HALF: ret half +// NO_HALF: ret float +half test_length_half(half p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v2f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v2f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half2(half2 p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v3f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v3f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half3(half3 p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v4f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v4f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half4(half4 p0) +{ + return length(p0); +} + +// CHECK: define noundef float @ +// CHECK: call float @llvm.fabs.f32(float +// CHECK: ret float +float test_length_float(float p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v2f32( +// CHECK: ret float %hlsl.length +float test_length_float2(float2 p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v3f32( +// CHECK: ret float %hlsl.length +float test_length_float3(float3 p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v4f32( +// CHECK: ret float %hlsl.length +float test_length_float4(float4 p0) +{ + return length(p0); +} diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl index d14e7c70ce06..83ad607c14a6 100644 --- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl +++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl @@ -1,85 +1,85 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: 
spirv-unknown-vulkan-compute %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv - -// NATIVE_HALF: define [[FNATTRS]] half @ -// NATIVE_HALF: call half @llvm.[[TARGET]].normalize.f16(half -// NO_HALF: call float @llvm.[[TARGET]].normalize.f32(float -// NATIVE_HALF: ret half -// NO_HALF: ret float -half test_normalize_half(half p0) -{ - return normalize(p0); -} -// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ -// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].normalize.v2f16(<2 x half> -// NO_HALF: call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float> -// NATIVE_HALF: ret <2 x half> %hlsl.normalize -// NO_HALF: ret <2 x float> %hlsl.normalize -half2 test_normalize_half2(half2 p0) -{ - return normalize(p0); -} -// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].normalize.v3f16(<3 x half> -// NO_HALF: call <3 x float> @llvm.[[TARGET]].normalize.v3f32(<3 x float> -// NATIVE_HALF: ret <3 x half> %hlsl.normalize -// NO_HALF: ret <3 x float> %hlsl.normalize -half3 test_normalize_half3(half3 p0) -{ - return normalize(p0); -} -// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ -// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].normalize.v4f16(<4 x half> -// NO_HALF: call <4 x float> @llvm.[[TARGET]].normalize.v4f32(<4 x float> -// NATIVE_HALF: ret <4 x half> %hlsl.normalize -// NO_HALF: ret <4 x float> %hlsl.normalize -half4 test_normalize_half4(half4 p0) -{ - return normalize(p0); -} - -// CHECK: define [[FNATTRS]] float @ -// CHECK: call float @llvm.[[TARGET]].normalize.f32(float -// CHECK: ret float -float test_normalize_float(float p0) -{ - return normalize(p0); -} -// CHECK: define [[FNATTRS]] <2 x float> @ -// CHECK: %hlsl.normalize = call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float> - -// CHECK: ret <2 x float> %hlsl.normalize -float2 test_normalize_float2(float2 p0) -{ - return normalize(p0); -} -// CHECK: define [[FNATTRS]] <3 x float> @ -// CHECK: %hlsl.normalize = call <3 x float> @llvm.[[TARGET]].normalize.v3f32( -// CHECK: ret <3 x float> %hlsl.normalize -float3 test_normalize_float3(float3 p0) -{ - return normalize(p0); -} -// CHECK: define [[FNATTRS]] <4 x float> @ -// CHECK: %hlsl.normalize = call <4 x float> @llvm.[[TARGET]].normalize.v4f32( -// CHECK: ret <4 x float> %hlsl.normalize -float4 test_length_float4(float4 p0) -{ - return normalize(p0); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// 
RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: define [[FNATTRS]] half @ +// NATIVE_HALF: call half @llvm.[[TARGET]].normalize.f16(half +// NO_HALF: call float @llvm.[[TARGET]].normalize.f32(float +// NATIVE_HALF: ret half +// NO_HALF: ret float +half test_normalize_half(half p0) +{ + return normalize(p0); +} +// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ +// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].normalize.v2f16(<2 x half> +// NO_HALF: call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float> +// NATIVE_HALF: ret <2 x half> %hlsl.normalize +// NO_HALF: ret <2 x float> %hlsl.normalize +half2 test_normalize_half2(half2 p0) +{ + return normalize(p0); +} +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].normalize.v3f16(<3 x half> +// NO_HALF: call <3 x float> @llvm.[[TARGET]].normalize.v3f32(<3 x float> +// NATIVE_HALF: ret <3 x half> %hlsl.normalize +// NO_HALF: ret <3 x float> %hlsl.normalize +half3 test_normalize_half3(half3 p0) +{ + return normalize(p0); +} +// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ +// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].normalize.v4f16(<4 x half> +// NO_HALF: call <4 x float> @llvm.[[TARGET]].normalize.v4f32(<4 x float> +// NATIVE_HALF: ret <4 x half> %hlsl.normalize +// NO_HALF: ret <4 x float> %hlsl.normalize +half4 test_normalize_half4(half4 p0) +{ + return normalize(p0); +} + +// CHECK: define [[FNATTRS]] float @ +// CHECK: call float @llvm.[[TARGET]].normalize.f32(float +// CHECK: ret float +float test_normalize_float(float p0) +{ + return normalize(p0); +} +// CHECK: define [[FNATTRS]] <2 x float> @ +// CHECK: %hlsl.normalize = call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float> + +// CHECK: ret <2 x float> %hlsl.normalize +float2 test_normalize_float2(float2 p0) +{ + return normalize(p0); +} +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.normalize = call <3 x float> @llvm.[[TARGET]].normalize.v3f32( +// CHECK: ret <3 x float> %hlsl.normalize +float3 test_normalize_float3(float3 p0) +{ + return normalize(p0); +} +// CHECK: define [[FNATTRS]] <4 x float> @ +// CHECK: %hlsl.normalize = call <4 x float> @llvm.[[TARGET]].normalize.v4f32( +// CHECK: ret <4 x float> %hlsl.normalize +float4 test_length_float4(float4 p0) +{ + return normalize(p0); +} diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl index 8ef52794a3be..442f4930ca57 100644 --- a/clang/test/CodeGenHLSL/builtins/step.hlsl +++ b/clang/test/CodeGenHLSL/builtins/step.hlsl @@ -1,84 +1,84 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: 
-DFNATTRS="spir_func noundef" -DTARGET=spv -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv - -// NATIVE_HALF: define [[FNATTRS]] half @ -// NATIVE_HALF: call half @llvm.[[TARGET]].step.f16(half -// NO_HALF: call float @llvm.[[TARGET]].step.f32(float -// NATIVE_HALF: ret half -// NO_HALF: ret float -half test_step_half(half p0, half p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ -// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].step.v2f16(<2 x half> -// NO_HALF: call <2 x float> @llvm.[[TARGET]].step.v2f32(<2 x float> -// NATIVE_HALF: ret <2 x half> %hlsl.step -// NO_HALF: ret <2 x float> %hlsl.step -half2 test_step_half2(half2 p0, half2 p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].step.v3f16(<3 x half> -// NO_HALF: call <3 x float> @llvm.[[TARGET]].step.v3f32(<3 x float> -// NATIVE_HALF: ret <3 x half> %hlsl.step -// NO_HALF: ret <3 x float> %hlsl.step -half3 test_step_half3(half3 p0, half3 p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ -// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].step.v4f16(<4 x half> -// NO_HALF: call <4 x float> @llvm.[[TARGET]].step.v4f32(<4 x float> -// NATIVE_HALF: ret <4 x half> %hlsl.step -// NO_HALF: ret <4 x float> %hlsl.step -half4 test_step_half4(half4 p0, half4 p1) -{ - return step(p0, p1); -} - -// CHECK: define [[FNATTRS]] float @ -// CHECK: call float @llvm.[[TARGET]].step.f32(float -// CHECK: ret float -float test_step_float(float p0, float p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <2 x float> @ -// CHECK: %hlsl.step = call <2 x float> @llvm.[[TARGET]].step.v2f32( -// CHECK: ret <2 x float> %hlsl.step -float2 test_step_float2(float2 p0, float2 p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <3 x float> @ -// CHECK: %hlsl.step = call <3 x float> @llvm.[[TARGET]].step.v3f32( -// CHECK: ret <3 x float> %hlsl.step -float3 test_step_float3(float3 p0, float3 p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <4 x float> @ -// CHECK: %hlsl.step = call <4 x float> @llvm.[[TARGET]].step.v4f32( -// CHECK: ret <4 x float> %hlsl.step -float4 test_step_float4(float4 p0, float4 p1) -{ - return step(p0, p1); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: 
define [[FNATTRS]] half @ +// NATIVE_HALF: call half @llvm.[[TARGET]].step.f16(half +// NO_HALF: call float @llvm.[[TARGET]].step.f32(float +// NATIVE_HALF: ret half +// NO_HALF: ret float +half test_step_half(half p0, half p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ +// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].step.v2f16(<2 x half> +// NO_HALF: call <2 x float> @llvm.[[TARGET]].step.v2f32(<2 x float> +// NATIVE_HALF: ret <2 x half> %hlsl.step +// NO_HALF: ret <2 x float> %hlsl.step +half2 test_step_half2(half2 p0, half2 p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].step.v3f16(<3 x half> +// NO_HALF: call <3 x float> @llvm.[[TARGET]].step.v3f32(<3 x float> +// NATIVE_HALF: ret <3 x half> %hlsl.step +// NO_HALF: ret <3 x float> %hlsl.step +half3 test_step_half3(half3 p0, half3 p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ +// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].step.v4f16(<4 x half> +// NO_HALF: call <4 x float> @llvm.[[TARGET]].step.v4f32(<4 x float> +// NATIVE_HALF: ret <4 x half> %hlsl.step +// NO_HALF: ret <4 x float> %hlsl.step +half4 test_step_half4(half4 p0, half4 p1) +{ + return step(p0, p1); +} + +// CHECK: define [[FNATTRS]] float @ +// CHECK: call float @llvm.[[TARGET]].step.f32(float +// CHECK: ret float +float test_step_float(float p0, float p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <2 x float> @ +// CHECK: %hlsl.step = call <2 x float> @llvm.[[TARGET]].step.v2f32( +// CHECK: ret <2 x float> %hlsl.step +float2 test_step_float2(float2 p0, float2 p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.step = call <3 x float> @llvm.[[TARGET]].step.v3f32( +// CHECK: ret <3 x float> %hlsl.step +float3 test_step_float3(float3 p0, float3 p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <4 x float> @ +// CHECK: %hlsl.step = call <4 x float> @llvm.[[TARGET]].step.v4f32( +// CHECK: ret <4 x float> %hlsl.step +float4 test_step_float4(float4 p0, float4 p1) +{ + return step(p0, p1); +} diff --git a/clang/test/Driver/flang/msvc-link.f90 b/clang/test/Driver/flang/msvc-link.f90 index 3f7e162a9a61..463749510eb5 100644 --- a/clang/test/Driver/flang/msvc-link.f90 +++ b/clang/test/Driver/flang/msvc-link.f90 @@ -1,5 +1,5 @@ -! RUN: %clang --driver-mode=flang --target=x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s -! -! Test that user provided paths come before the Flang runtimes -! CHECK: "-libpath:test" -! CHECK: "-libpath:{{.*(\\|/)}}lib" +! RUN: %clang --driver-mode=flang --target=x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s +! +! Test that user provided paths come before the Flang runtimes +! CHECK: "-libpath:test" +! 
CHECK: "-libpath:{{.*(\\|/)}}lib" diff --git a/clang/test/FixIt/fixit-newline-style.c b/clang/test/FixIt/fixit-newline-style.c index 2aac143d4d75..61e4df67e85b 100644 --- a/clang/test/FixIt/fixit-newline-style.c +++ b/clang/test/FixIt/fixit-newline-style.c @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -pedantic -Wunused-label -fno-diagnostics-show-line-numbers -x c %s 2>&1 | FileCheck %s -strict-whitespace - -// This file intentionally uses a CRLF newline style -// CHECK: warning: unused label 'ddd' -// CHECK-NEXT: {{^ ddd:}} -// CHECK-NEXT: {{^ \^~~~$}} -// CHECK-NOT: {{^ ;}} -void f(void) { - ddd: - ; -} +// RUN: %clang_cc1 -pedantic -Wunused-label -fno-diagnostics-show-line-numbers -x c %s 2>&1 | FileCheck %s -strict-whitespace + +// This file intentionally uses a CRLF newline style +// CHECK: warning: unused label 'ddd' +// CHECK-NEXT: {{^ ddd:}} +// CHECK-NEXT: {{^ \^~~~$}} +// CHECK-NOT: {{^ ;}} +void f(void) { + ddd: + ; +} diff --git a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c index 2faeaba32292..d6724444c066 100644 --- a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c +++ b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -E -frewrite-includes %s | %clang_cc1 - -// expected-no-diagnostics -// Note: This source file has CRLF line endings. -// This test validates that -frewrite-includes translates the end of line (EOL) -// form used in header files to the EOL form used in the the primary source -// file when the files use different EOL forms. -#include "rewrite-includes-mixed-eol-crlf.h" -#include "rewrite-includes-mixed-eol-lf.h" +// RUN: %clang_cc1 -E -frewrite-includes %s | %clang_cc1 - +// expected-no-diagnostics +// Note: This source file has CRLF line endings. +// This test validates that -frewrite-includes translates the end of line (EOL) +// form used in header files to the EOL form used in the the primary source +// file when the files use different EOL forms. +#include "rewrite-includes-mixed-eol-crlf.h" +#include "rewrite-includes-mixed-eol-lf.h" diff --git a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h index baedc282296b..0439b88b75e2 100644 --- a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h +++ b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h @@ -1,11 +1,11 @@ -// Note: This header file has CRLF line endings. -// The indentation in some of the conditional inclusion directives below is -// intentional and is required for this test to function as a regression test -// for GH59736. -_Static_assert(__LINE__ == 5, ""); -#if 1 -_Static_assert(__LINE__ == 7, ""); - #if 1 - _Static_assert(__LINE__ == 9, ""); - #endif -#endif +// Note: This header file has CRLF line endings. +// The indentation in some of the conditional inclusion directives below is +// intentional and is required for this test to function as a regression test +// for GH59736. 
+_Static_assert(__LINE__ == 5, "");
+#if 1
+_Static_assert(__LINE__ == 7, "");
+ #if 1
+ _Static_assert(__LINE__ == 9, "");
+ #endif
+#endif
diff --git a/clang/test/Frontend/system-header-line-directive-ms-lineendings.c b/clang/test/Frontend/system-header-line-directive-ms-lineendings.c
index dffdd5cf1959..92fc07f65e0d 100644
--- a/clang/test/Frontend/system-header-line-directive-ms-lineendings.c
+++ b/clang/test/Frontend/system-header-line-directive-ms-lineendings.c
@@ -1,21 +1,21 @@
-// RUN: %clang_cc1 %s -E -o - -I %S/Inputs -isystem %S/Inputs/SystemHeaderPrefix | FileCheck %s
-#include <noline.h>
-#include <line-directive-in-system.h>
-
-#include "line-directive.h"
-
-// This tests that the line numbers for the current file are correctly outputted
-// for the include-file-completed test case. This file should be CRLF.
-
-// CHECK: # 1 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
-// CHECK: # 1 "{{.*}}noline.h" 1 3
-// CHECK: foo(void);
-// CHECK: # 3 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
-// CHECK: # 1 "{{.*}}line-directive-in-system.h" 1 3
-// The "3" below indicates that "foo.h" is considered a system header.
-// CHECK: # 1 "foo.h" 3
-// CHECK: foo(void);
-// CHECK: # 4 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
-// CHECK: # 1 "{{.*}}line-directive.h" 1
-// CHECK: # 10 "foo.h"{{$}}
-// CHECK: # 6 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
+// RUN: %clang_cc1 %s -E -o - -I %S/Inputs -isystem %S/Inputs/SystemHeaderPrefix | FileCheck %s
+#include <noline.h>
+#include <line-directive-in-system.h>
+
+#include "line-directive.h"
+
+// This tests that the line numbers for the current file are correctly outputted
+// for the include-file-completed test case. This file should be CRLF.
+
+// CHECK: # 1 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
+// CHECK: # 1 "{{.*}}noline.h" 1 3
+// CHECK: foo(void);
+// CHECK: # 3 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
+// CHECK: # 1 "{{.*}}line-directive-in-system.h" 1 3
+// The "3" below indicates that "foo.h" is considered a system header. 
+// CHECK: # 1 "foo.h" 3 +// CHECK: foo(void); +// CHECK: # 4 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 +// CHECK: # 1 "{{.*}}line-directive.h" 1 +// CHECK: # 10 "foo.h"{{$}} +// CHECK: # 6 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 diff --git a/clang/test/ParserHLSL/bitfields.hlsl b/clang/test/ParserHLSL/bitfields.hlsl index 57b6705babdc..307d1143a068 100644 --- a/clang/test/ParserHLSL/bitfields.hlsl +++ b/clang/test/ParserHLSL/bitfields.hlsl @@ -1,31 +1,31 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -ast-dump -x hlsl -o - %s | FileCheck %s - - -struct MyBitFields { - // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field1 'unsigned int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 3 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 3 - unsigned int field1 : 3; // 3 bits for field1 - - // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field2 'unsigned int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 4 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 4 - unsigned int field2 : 4; // 4 bits for field2 - - // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:7 field3 'int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 5 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 5 - int field3 : 5; // 5 bits for field3 (signed) -}; - - - -[numthreads(1,1,1)] -void main() { - MyBitFields m; - m.field1 = 4; - m.field2 = m.field1*2; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -ast-dump -x hlsl -o - %s | FileCheck %s + + +struct MyBitFields { + // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field1 'unsigned int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 3 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 3 + unsigned int field1 : 3; // 3 bits for field1 + + // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field2 'unsigned int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 4 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 4 + unsigned int field2 : 4; // 4 bits for field2 + + // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:7 field3 'int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 5 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 5 + int field3 : 5; // 5 bits for field3 (signed) +}; + + + +[numthreads(1,1,1)] +void main() { + MyBitFields m; + m.field1 = 4; + m.field2 = m.field1*2; } \ No newline at end of file diff --git a/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl b/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl index 5b228d039345..2eebc920388b 100644 --- a/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl +++ b/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl @@ -1,21 +1,21 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// tests that hlsl annotations are properly parsed when applied on field decls, -// and that the annotation gets properly placed on the AST. 
-
-struct Eg9{
- // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:8 implicit struct Eg9
- // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced a 'unsigned int'
- // CHECK: -HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-f]+}}
- unsigned int a : SV_DispatchThreadID;
-};
-Eg9 e9;
-
-
-RWBuffer<unsigned int> In : register(u1);
-
-
-[numthreads(1,1,1)]
-void main() {
- In[0] = e9.a;
-}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// tests that hlsl annotations are properly parsed when applied on field decls,
+// and that the annotation gets properly placed on the AST.
+
+struct Eg9{
+ // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:8 implicit struct Eg9
+ // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced a 'unsigned int'
+ // CHECK: -HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-f]+}}
+ unsigned int a : SV_DispatchThreadID;
+};
+Eg9 e9;
+
+
+RWBuffer<unsigned int> In : register(u1);
+
+
+[numthreads(1,1,1)]
+void main() {
+ In[0] = e9.a;
+}
diff --git a/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl b/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl
index 476ec39e14da..5a72aa242e58 100644
--- a/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl
+++ b/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl
@@ -1,25 +1,25 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -ast-dump -o - %s | FileCheck %s
-
-typedef vector<float, 4> float4;
-
-// CHECK: -TypeAliasDecl 0x{{[0-9a-f]+}}
-// CHECK: -HLSLAttributedResourceType 0x{{[0-9a-f]+}} '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]]
-using ResourceIntAliasT = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(int)]];
-ResourceIntAliasT h1;
-
-// CHECK: -VarDecl 0x{{[0-9a-f]+}} col:82 h2 '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float4)]]
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float4)]] h2;
-
-// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:30 S
-// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:20 referenced typename depth 0 index 0 T
-// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:30 struct S definition
-// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:79 h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::contained_type(T)]]
-template <typename T> struct S {
- __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(T)]] h;
-};
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -ast-dump -o - %s | FileCheck %s
+
+typedef vector<float, 4> float4;
+
+// CHECK: -TypeAliasDecl 0x{{[0-9a-f]+}}
+// CHECK: -HLSLAttributedResourceType 0x{{[0-9a-f]+}} '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]]
+using ResourceIntAliasT = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(int)]];
+ResourceIntAliasT h1;
+
+// CHECK: -VarDecl 0x{{[0-9a-f]+}} col:82 h2 '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float4)]]
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float4)]] h2;
+
+// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:30 S
+// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:20 referenced typename depth 0 index 0 T
+// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:30 struct S 
definition
+// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:79 h '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::contained_type(T)]]
+template <typename T> struct S {
+ __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(T)]] h;
+};
diff --git a/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
index 673ff8693b83..b2d492d95945 100644
--- a/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
@@ -1,28 +1,28 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -o - %s -verify
-
-typedef vector<float, 4> float4;
-
-// expected-error@+1{{'contained_type' attribute cannot be applied to a declaration}}
-[[hlsl::contained_type(float4)]] __hlsl_resource_t h1;
-
-// expected-error@+1{{'contained_type' attribute takes one argument}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type()]] h3;
-
-// expected-error@+1{{expected a type}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(0)]] h4;
-
-// expected-error@+1{{unknown type name 'a'}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(a)]] h5;
-
-// expected-error@+1{{expected a type}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type("b", c)]] h6;
-
-// expected-warning@+1{{attribute 'contained_type' is already applied}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(float)]] h7;
-
-// expected-warning@+1{{attribute 'contained_type' is already applied with different arguments}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(int)]] h8;
-
-// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-// expected-error@+1{{attribute 'contained_type' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-float [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] res5;
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -o - %s -verify
+
+typedef vector<float, 4> float4;
+
+// expected-error@+1{{'contained_type' attribute cannot be applied to a declaration}}
+[[hlsl::contained_type(float4)]] __hlsl_resource_t h1;
+
+// expected-error@+1{{'contained_type' attribute takes one argument}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type()]] h3;
+
+// expected-error@+1{{expected a type}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(0)]] h4;
+
+// expected-error@+1{{unknown type name 'a'}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(a)]] h5;
+
+// expected-error@+1{{expected a type}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type("b", c)]] h6;
+
+// expected-warning@+1{{attribute 'contained_type' is already applied}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(float)]] h7;
+
+// expected-warning@+1{{attribute 'contained_type' is already applied with different arguments}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(int)]] h8;
+
+// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+// expected-error@+1{{attribute 'contained_type' can be used only on HLSL intangible type 
+float [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] res5;
diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl
index 487dc3241303..836d129c8d00 100644
--- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl
+++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
-
-// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition
-// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
-struct MyBuffer {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h;
-};
-
-// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
-__hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res;
-
-// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void ()
-// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]]
-// CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
-void f() {
-  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r;
-}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition
+// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
+struct MyBuffer {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h;
+};
+
+// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
+__hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res;
+
+// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void ()
+// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]]
+// CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
+void f() {
+  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r;
+}
diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl
index 9bb64ea990e2..3b2c12e7a96c 100644
--- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl
@@ -1,20 +1,20 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify
-
-// expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}}
-[[hlsl::is_rov]] __hlsl_resource_t res0;
-
-// expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}}
-__hlsl_resource_t [[hlsl::is_rov]] res1;
-
-// expected-error@+1{{'is_rov' attribute takes no arguments}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2;
-
-// expected-error@+1{{use of undeclared identifier 'gibberish'}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3;
-
-// expected-warning@+1{{attribute 'is_rov' is already applied}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4;
-
-// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-// expected-error@+1{{attribute 'is_rov' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-float [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] res5;
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify
+
+// expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}}
+[[hlsl::is_rov]] __hlsl_resource_t res0;
+
+// expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}}
+__hlsl_resource_t [[hlsl::is_rov]] res1;
+
+// expected-error@+1{{'is_rov' attribute takes no arguments}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2;
+
+// expected-error@+1{{use of undeclared identifier 'gibberish'}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3;
+
+// expected-warning@+1{{attribute 'is_rov' is already applied}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4;
+
+// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+// expected-error@+1{{attribute 'is_rov' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+float [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] res5;
diff --git a/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl
index e09ed5586c10..84c924eec24e 100644
--- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl
+++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
-
-// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition
-// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:72 h1 '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
-struct MyBuffer {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h1;
-};
-
-// CHECK: VarDecl 0x{{[0-9a-f]+}} col:70 h2 '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
-__hlsl_resource_t [[hlsl::raw_buffer]] [[hlsl::resource_class(SRV)]] h2;
-
-// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void ()
-// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 h3 '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
-void f() {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h3;
-}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition
+// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:72 h1 '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
+struct MyBuffer {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h1;
+};
+
+// CHECK: VarDecl 0x{{[0-9a-f]+}} col:70 h2 '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
+__hlsl_resource_t [[hlsl::raw_buffer]] [[hlsl::resource_class(SRV)]] h2;
+
+// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void ()
+// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 h3 '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
+void f() {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h3;
+}
diff --git a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl
index a10aca4e96fc..77530cbf9e4d 100644
--- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl
@@ -1,17 +1,17 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify
-
-// expected-error@+1{{'raw_buffer' attribute cannot be applied to a declaration}}
-[[hlsl::raw_buffer]] __hlsl_resource_t res0;
-
-// expected-error@+1{{'raw_buffer' attribute takes no arguments}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(3)]] res2;
-
-// expected-error@+1{{use of undeclared identifier 'gibberish'}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(gibberish)]] res3;
-
-// expected-warning@+1{{attribute 'raw_buffer' is already applied}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] [[hlsl::raw_buffer]] res4;
-
-// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-// expected-error@+1{{attribute 'raw_buffer' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-float [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] res5;
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify
+
+// expected-error@+1{{'raw_buffer' attribute cannot be applied to a declaration}}
+[[hlsl::raw_buffer]] __hlsl_resource_t res0;
+
+// expected-error@+1{{'raw_buffer' attribute takes no arguments}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(3)]] res2;
+
+// expected-error@+1{{use of undeclared identifier 'gibberish'}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(gibberish)]] res3;
+
+// expected-warning@+1{{attribute 'raw_buffer' is already applied}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] [[hlsl::raw_buffer]] res4;
+
+// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+// expected-error@+1{{attribute 'raw_buffer' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+float [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] res5;
diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl
index 9fee9edddf61..fbada8b4b99f 100644
--- a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl
+++ b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl
@@ -1,37 +1,37 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
-
-// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition
-// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-struct MyBuffer {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] h;
-};
-
-// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]]
-__hlsl_resource_t [[hlsl::resource_class(SRV)]] res;
-
-// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 3]]:6 f 'void ()
-// CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]]
-void f() {
-  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] r;
-}
-
-// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:29 MyBuffer2
-// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T
-// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:29 struct MyBuffer2 definition
-// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2
-// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-template <typename T> struct MyBuffer2 {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] h;
-};
-
-// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} line:[[# @LINE - 4]]:29 struct MyBuffer2 definition implicit_instantiation
-// CHECK: TemplateArgument type 'float'
-// CHECK: BuiltinType 0x{{[0-9a-f]+}} 'float'
-// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2
-// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-MyBuffer2<float> myBuffer2;
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition
+// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+struct MyBuffer {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] h;
+};
+
+// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]]
+__hlsl_resource_t [[hlsl::resource_class(SRV)]] res;
+
+// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 3]]:6 f 'void ()
+// CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]]
+void f() {
+  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] r;
+}
+
+// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:29 MyBuffer2
+// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T
+// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:29 struct MyBuffer2 definition
+// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2
+// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+template <typename T> struct MyBuffer2 {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] h;
+};
+
+// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} line:[[# @LINE - 4]]:29 struct MyBuffer2 definition implicit_instantiation
+// CHECK: TemplateArgument type 'float'
+// CHECK: BuiltinType 0x{{[0-9a-f]+}} 'float'
+// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2
+// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+MyBuffer2<float> myBuffer2;
diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl
index a0a4da1dc2bf..63e39daff949 100644
--- a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify
-
-// expected-error@+1{{'resource_class' attribute cannot be applied to a declaration}}
-[[hlsl::resource_class(UAV)]] __hlsl_resource_t e0;
-
-// expected-error@+1{{'resource_class' attribute takes one argument}}
-__hlsl_resource_t [[hlsl::resource_class()]] e1;
-
-// expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}}
-__hlsl_resource_t [[hlsl::resource_class(gibberish)]] e2;
-
-// expected-warning@+1{{attribute 'resource_class' is already applied with different arguments}}
-__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(UAV)]] e3;
-
-// expected-warning@+1{{attribute 'resource_class' is already applied}}
-__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4;
-
-// expected-error@+1{{'resource_class' attribute takes one argument}}
-__hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5;
-
-// expected-error@+1{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-float [[hlsl::resource_class(UAV)]] e6;
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify
+
+// expected-error@+1{{'resource_class' attribute cannot be applied to a declaration}}
+[[hlsl::resource_class(UAV)]] __hlsl_resource_t e0;
+
+// expected-error@+1{{'resource_class' attribute takes one argument}}
+__hlsl_resource_t [[hlsl::resource_class()]] e1;
+
+// expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}}
+__hlsl_resource_t [[hlsl::resource_class(gibberish)]] e2;
+
+// expected-warning@+1{{attribute 'resource_class' is already applied with different arguments}}
+__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(UAV)]] e3;
+
+// expected-warning@+1{{attribute 'resource_class' is already applied}}
+__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4;
+
+// expected-error@+1{{'resource_class' attribute takes one argument}}
+__hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5;
+
+// expected-error@+1{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+float [[hlsl::resource_class(UAV)]] e6;
diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
index 8885e3923735..38d27bc21e4a 100644
--- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
+++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
@@ -1,21 +1,21 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s
-
-// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RWBuffer definition implicit_instantiation
-// CHECK: -TemplateArgument type 'float'
-// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float'
-// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]]
-// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer
-RWBuffer<float> Buffer1;
-
-// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RasterizerOrderedBuffer definition implicit_instantiation
-// CHECK: -TemplateArgument type 'vector<float, 4>'
-// CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector<float, 4>' 4
-// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float'
-// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]
-// CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
-// CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector<float, 4>)]]
-// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer
-RasterizerOrderedBuffer<vector<float, 4> > BufferArray3[4];
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RWBuffer definition implicit_instantiation
+// CHECK: -TemplateArgument type 'float'
+// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float'
+// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]]
+// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer
+RWBuffer<float> Buffer1;
+
+// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RasterizerOrderedBuffer definition implicit_instantiation
+// CHECK: -TemplateArgument type 'vector<float, 4>'
+// CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector<float, 4>' 4
+// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float'
+// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]
+// CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
+// CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector<float, 4>)]]
+// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer
+RasterizerOrderedBuffer<vector<float, 4> > BufferArray3[4];
diff --git a/clang/test/Sema/aarch64-sve-vector-trig-ops.c b/clang/test/Sema/aarch64-sve-vector-trig-ops.c
index f853abcd3379..3fe6834be2e0 100644
--- a/clang/test/Sema/aarch64-sve-vector-trig-ops.c
+++ b/clang/test/Sema/aarch64-sve-vector-trig-ops.c
@@ -1,65 +1,65 @@
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve \
-// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify
-// REQUIRES: aarch64-registered-target
-
-#include <arm_sve.h>
-
-svfloat32_t test_asin_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_asin(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_acos_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_acos(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_atan_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_atan(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_atan2_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_atan2(v, v);
-  // expected-error@-1 {{1st argument must be a floating point type}}
-}
-
-svfloat32_t test_sin_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_sin(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_cos_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_cos(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_tan_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_tan(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_sinh_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_sinh(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_cosh_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_cosh(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_tanh_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_tanh(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve \
+// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+svfloat32_t test_asin_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_asin(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_acos_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_acos(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_atan_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_atan(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_atan2_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_atan2(v, v);
+  // expected-error@-1 {{1st argument must be a floating point type}}
+}
+
+svfloat32_t test_sin_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_sin(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_cos_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_cos(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_tan_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_tan(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_sinh_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_sinh(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_cosh_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_cosh(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_tanh_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_tanh(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
diff --git a/clang/test/Sema/riscv-rvv-vector-trig-ops.c b/clang/test/Sema/riscv-rvv-vector-trig-ops.c
index 006c136f8033..0aed1b2a0998 100644
--- a/clang/test/Sema/riscv-rvv-vector-trig-ops.c
+++ b/clang/test/Sema/riscv-rvv-vector-trig-ops.c
@@ -1,67 +1,67 @@
-// RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \
-// RUN:   -target-feature +v -target-feature +zfh -target-feature +zvfh \
-// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify
-// REQUIRES: riscv-registered-target
-
-#include <riscv_vector.h>
-
-vfloat32mf2_t test_asin_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_asin(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-  }
-
-  vfloat32mf2_t test_acos_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_acos(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-  }
-
-  vfloat32mf2_t test_atan_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_atan(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-  }
-
-vfloat32mf2_t test_atan2_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_atan2(v, v);
-  // expected-error@-1 {{1st argument must be a floating point type}}
-}
-
-vfloat32mf2_t test_sin_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_sin(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-vfloat32mf2_t test_cos_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_cos(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-vfloat32mf2_t test_tan_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_tan(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-vfloat32mf2_t test_sinh_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_sinh(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-  }
-
-  vfloat32mf2_t test_cosh_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_cosh(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-  }
-
-  vfloat32mf2_t test_tanh_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_tanh(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-  }
-
+// RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \
+// RUN:   -target-feature +v -target-feature +zfh -target-feature +zvfh \
+// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify
+// REQUIRES: riscv-registered-target
+
+#include <riscv_vector.h>
+
+vfloat32mf2_t test_asin_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_asin(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+  }
+
+  vfloat32mf2_t test_acos_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_acos(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+  }
+
+  vfloat32mf2_t test_atan_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_atan(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+  }
+
+vfloat32mf2_t test_atan2_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_atan2(v, v);
+  // expected-error@-1 {{1st argument must be a floating point type}}
+}
+
+vfloat32mf2_t test_sin_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_sin(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+vfloat32mf2_t test_cos_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_cos(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+vfloat32mf2_t test_tan_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_tan(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+vfloat32mf2_t test_sinh_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_sinh(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+  }
+
+  vfloat32mf2_t test_cosh_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_cosh(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+  }
+
+  vfloat32mf2_t test_tanh_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_tanh(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+  }
+
diff --git a/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl
index b60fba62bdb0..764b9e843f7f 100644
--- a/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl
+++ b/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl
@@ -1,119 +1,119 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \
-// RUN: -fsyntax-only -verify %s
-
-__attribute__((availability(shadermodel, introduced = 6.5)))
-float fx(float); // #fx
-
-__attribute__((availability(shadermodel, introduced = 6.6)))
-half fx(half); // #fx_half
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
-float fy(float); // #fy
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
-float fz(float); // #fz
-
-float also_alive(float f) {
-  // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #also_alive_fx_call
-  // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #also_alive_fy_call
-  // expected-error@#also_alive_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #also_alive_fz_call
-  return 0;
-}
-
-float alive(float f) {
-  // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #alive_fx_call
-  // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #alive_fy_call
-  // expected-error@#alive_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #alive_fz_call
-
-  return also_alive(f);
-}
-
-float also_dead(float f) {
-  // unreachable code - no errors expected
-  float A = fx(f);
-  float B = fy(f);
-  float C = fz(f);
-  return 0;
-}
-
-float dead(float f) {
-  // unreachable code - no errors expected
-  float A = fx(f);
-  float B = fy(f);
-  float C = fz(f);
-
-  return also_dead(f);
-}
-
-template <typename T>
-T aliveTemp(T f) {
-  // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #aliveTemp_fx_call
-  // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #aliveTemp_fy_call
-  // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #aliveTemp_fz_call
-  return 0;
-}
-
-template <typename T> T aliveTemp2(T f) {
-  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
-  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
-  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  return fx(f); // #aliveTemp2_fx_call
-}
-
-half test(half x) {
-  return aliveTemp2(x);
-}
-
-float test(float x) {
-  return aliveTemp2(x);
-}
-
-class MyClass
-{
-  float F;
-  float makeF() {
-    // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-    float A = fx(F); // #MyClass_makeF_fx_call
-    // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-    float B = fy(F); // #MyClass_makeF_fy_call
-    // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}}
-    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-    float C = fz(F); // #MyClass_makeF_fz_call
-    return 0;
-  }
-};
-
-[numthreads(4,1,1)]
-float main() {
-  float f = 3;
-  MyClass C = { 1.0f };
-  float a = alive(f);
-  float b = aliveTemp(f); // #aliveTemp_inst
-  float c = C.makeF();
-  float d = test((float)1.0);
-  float e = test((half)1.0);
-  return a * b * c;
-}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \
+// RUN: -fsyntax-only -verify %s
+
+__attribute__((availability(shadermodel, introduced = 6.5)))
+float fx(float); // #fx
+
+__attribute__((availability(shadermodel, introduced = 6.6)))
+half fx(half); // #fx_half
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
+float fy(float); // #fy
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
+float fz(float); // #fz
+
+float also_alive(float f) {
+  // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #also_alive_fx_call
+  // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #also_alive_fy_call
+  // expected-error@#also_alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #also_alive_fz_call
+  return 0;
+}
+
+float alive(float f) {
+  // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #alive_fx_call
+  // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #alive_fy_call
+  // expected-error@#alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #alive_fz_call
+
+  return also_alive(f);
+}
+
+float also_dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return 0;
+}
+
+float dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+
+  return also_dead(f);
+}
+
+template <typename T>
+T aliveTemp(T f) {
+  // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #aliveTemp_fx_call
+  // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #aliveTemp_fy_call
+  // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #aliveTemp_fz_call
+  return 0;
+}
+
+template <typename T> T aliveTemp2(T f) {
+  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
+  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
+  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  return fx(f); // #aliveTemp2_fx_call
+}
+
+half test(half x) {
+  return aliveTemp2(x);
+}
+
+float test(float x) {
+  return aliveTemp2(x);
+}
+
+class MyClass
+{
+  float F;
+  float makeF() {
+    // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+    float A = fx(F); // #MyClass_makeF_fx_call
+    // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float B = fy(F); // #MyClass_makeF_fy_call
+    // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}}
+    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float C = fz(F); // #MyClass_makeF_fz_call
+    return 0;
+  }
+};
+
+[numthreads(4,1,1)]
+float main() {
+  float f = 3;
+  MyClass C = { 1.0f };
+  float a = alive(f);
+  float b = aliveTemp(f); // #aliveTemp_inst
+  float c = C.makeF();
+  float d = test((float)1.0);
+  float e = test((half)1.0);
+  return a * b * c;
+}
diff --git a/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl
index 35b7c384f26c..6bfc8577670c 100644
--- a/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl
+++ b/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl
@@ -1,180 +1,180 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \
-// RUN: -fsyntax-only -verify %s
-
-__attribute__((availability(shadermodel, introduced = 6.5)))
-float fx(float); // #fx
-
-__attribute__((availability(shadermodel, introduced = 6.6)))
-half fx(half); // #fx_half
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
-float fy(float); // #fy
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
-float fz(float); // #fz
-
-float also_alive(float f) {
-  // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #also_alive_fx_call
-
-  // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #also_alive_fy_call
-
-  // expected-error@#also_alive_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #also_alive_fz_call
-
-  return 0;
-}
-
-float alive(float f) {
-  // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #alive_fx_call
-
-  // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #alive_fy_call
-
-  // expected-error@#alive_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #alive_fz_call
-
-  return also_alive(f);
-}
-
-float also_dead(float f) {
-  // unreachable code - no errors expected
-  float A = fx(f);
-  float B = fy(f);
-  float C = fz(f);
-  return 0;
-}
-
-float dead(float f) {
-  // unreachable code - no errors expected
-  float A = fx(f);
-  float B = fy(f);
-  float C = fz(f);
-  return also_dead(f);
-}
-
-template <typename T>
-T aliveTemp(T f) {
-  // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #aliveTemp_fx_call
-  // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #aliveTemp_fy_call
-  // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #aliveTemp_fz_call
-  return 0;
-}
-
-template <typename T> T aliveTemp2(T f) {
-  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
-  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
-  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  return fx(f); // #aliveTemp2_fx_call
-}
-
-half test(half x) {
-  return aliveTemp2(x);
-}
-
-float test(float x) {
-  return aliveTemp2(x);
-}
-
-class MyClass
-{
-  float F;
-  float makeF() {
-    // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-    float A = fx(F); // #MyClass_makeF_fx_call
-    // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-    float B = fy(F); // #MyClass_makeF_fy_call
-    // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}}
-    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-    float C = fz(F); // #MyClass_makeF_fz_call
-    return 0;
-  }
-};
-
-// Exported function without body, not used
-export void exportedFunctionUnused(float f);
-
-// Exported function with body, without export, not used
-void exportedFunctionUnused(float f) {
-  // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #exportedFunctionUnused_fx_call
-
-  // API with shader-stage-specific availability in unused exported library function
-  // - no errors expected because the actual shader stage this function
-  // will be used in not known at this time
-  float B = fy(f);
-  float C = fz(f);
-}
-
-// Exported function with body - called from main() which is a compute shader entry point
-export void exportedFunctionUsed(float f) {
-  // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #exportedFunctionUsed_fx_call
-
-  // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #exportedFunctionUsed_fy_call
-
-  // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #exportedFunctionUsed_fz_call
-}
-
-namespace A {
-  namespace B {
-    export {
-      void exportedFunctionInNS(float x) {
-        // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-        // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-        float A = fx(x); // #exportedFunctionInNS_fx_call
-
-        // API with shader-stage-specific availability in exported library function
-        // - no errors expected because the actual shader stage this function
-        // will be used in not known at this time
-        float B = fy(x);
-        float C = fz(x);
-      }
-    }
-  }
-}
-
-// Shader entry point without body
-[shader("compute")]
-[numthreads(4,1,1)]
-float main();
-
-// Shader entry point with body
-[shader("compute")]
-[numthreads(4,1,1)]
-float main() {
-  float f = 3;
-  MyClass C = { 1.0f };
-  float a = alive(f);
-  float b = aliveTemp(f); // #aliveTemp_inst
-  float c = C.makeF();
-  float d = test((float)1.0);
-  float e = test((half)1.0);
-  exportedFunctionUsed(1.0f);
-  return a * b * c;
-}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \
+// RUN: -fsyntax-only -verify %s
+
+__attribute__((availability(shadermodel, introduced = 6.5)))
+float fx(float); // #fx
+
+__attribute__((availability(shadermodel, introduced = 6.6)))
+half fx(half); // #fx_half
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
+float fy(float); // #fy
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
+float fz(float); // #fz
+
+float also_alive(float f) {
+  // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #also_alive_fx_call
+
+  // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #also_alive_fy_call
+
+  // expected-error@#also_alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #also_alive_fz_call
+
+  return 0;
+}
+
+float alive(float f) {
+  // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #alive_fx_call
+
+  // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #alive_fy_call
+
+  // expected-error@#alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #alive_fz_call
+
+  return also_alive(f);
+}
+
+float also_dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return 0;
+}
+
+float dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return also_dead(f);
+}
+
+template <typename T>
+T aliveTemp(T f) {
+  // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #aliveTemp_fx_call
+  // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #aliveTemp_fy_call
+  // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #aliveTemp_fz_call
+  return 0;
+}
+
+template <typename T> T aliveTemp2(T f) {
+  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
+  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
+  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  return fx(f); // #aliveTemp2_fx_call
+}
+
+half test(half x) {
+  return aliveTemp2(x);
+}
+
+float test(float x) {
+  return aliveTemp2(x);
+}
+
+class MyClass
+{
+  float F;
+  float makeF() {
+    // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+    float A = fx(F); // #MyClass_makeF_fx_call
+    // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float B = fy(F); // #MyClass_makeF_fy_call
+    // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}}
+    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float C = fz(F); // #MyClass_makeF_fz_call
+    return 0;
+  }
+};
+
+// Exported function without body, not used
+export void exportedFunctionUnused(float f);
+
+// Exported function with body, without export, not used
+void exportedFunctionUnused(float f) {
+  // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #exportedFunctionUnused_fx_call
+
+  // API with shader-stage-specific availability in unused exported library function
+  // - no errors expected because the actual shader stage this function
+  // will be used in not known at this time
+  float B = fy(f);
+  float C = fz(f);
+}
+
+// Exported function with body - called from main() which is a compute shader entry point
+export void exportedFunctionUsed(float f) {
+  // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #exportedFunctionUsed_fx_call
+
+  // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #exportedFunctionUsed_fy_call
+
+  // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #exportedFunctionUsed_fz_call
+}
+
+namespace A {
+  namespace B {
+    export {
+      void exportedFunctionInNS(float x) {
+        // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+        // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+        float A = fx(x); // #exportedFunctionInNS_fx_call
+
+        // API with shader-stage-specific availability in exported library function
+        // - no errors expected because the actual shader stage this function
+        // will be used in not known at this time
+        float B = fy(x);
+        float C = fz(x);
+      }
+    }
+  }
+}
+
+// Shader entry point without body
+[shader("compute")]
+[numthreads(4,1,1)]
+float main();
+
+// Shader entry point with body
+[shader("compute")]
+[numthreads(4,1,1)]
+float main() {
+  float f = 3;
+  MyClass C = { 1.0f };
+  float a = alive(f);
+  float b = aliveTemp(f); // #aliveTemp_inst
+  float c = C.makeF();
+  float d = test((float)1.0);
+  float e = test((half)1.0);
+  exportedFunctionUsed(1.0f);
+  return a * b * c;
+}
diff --git a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl
index 406879838393..65836c55821d 100644
--- a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl
+++ b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl
@@ -1,119 +1,119 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \
-// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s
-
-__attribute__((availability(shadermodel, introduced = 6.5)))
-float fx(float); // #fx
-
-__attribute__((availability(shadermodel, introduced = 6.6)))
-half fx(half); // #fx_half
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
-float fy(float); // #fy
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
-float fz(float); // #fz
-
-float also_alive(float f) {
-  // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #also_alive_fx_call
-  // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #also_alive_fy_call
-  // expected-warning@#also_alive_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #also_alive_fz_call
-  return 0;
-}
-
-float alive(float f) {
-  // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #alive_fx_call
-  // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #alive_fy_call
-  // expected-warning@#alive_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #alive_fz_call
-
-  return also_alive(f);
-}
-
-float also_dead(float f) {
-  // unreachable code - no errors expected
-  float A = fx(f);
-  float B = fy(f);
-  float C = fz(f);
-  return 0;
-}
-
-float dead(float f) {
-  // unreachable code - no errors expected
-  float A = fx(f);
-  float B = fy(f);
-  float C = fz(f);
-
-  return also_dead(f);
-}
-
-template <typename T>
-T aliveTemp(T f) {
-  // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #aliveTemp_fx_call
-  // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #aliveTemp_fy_call
-  // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #aliveTemp_fz_call
-  return 0;
-}
-
-template <typename T> T aliveTemp2(T f) {
expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ +// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + return 0; +} + +float alive(float f) { + // 
expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + // expected-warning@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + + return also_dead(f); +} + +template +T aliveTemp(T f) { + // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template T aliveTemp2(T f) { + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-warning@#MyClass_makeF_fz_call 
{{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl index a23e91a546b1..4c9783138f67 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl @@ -1,162 +1,162 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - - // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - - // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - - return 0; -} - -float alive(float f) { - // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - - // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - - // expected-warning@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = 
fy(f);
-  float C = fz(f);
-  return 0;
-}
-
-float dead(float f) {
-  // unreachable code - no errors expected
-  float A = fx(f);
-  float B = fy(f);
-  float C = fz(f);
-  return also_dead(f);
-}
-
-template <typename T>
-T aliveTemp(T f) {
-  // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #aliveTemp_fx_call
-  // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #aliveTemp_fy_call
-  // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #aliveTemp_fz_call
-  return 0;
-}
-
-template <typename T> T aliveTemp2(T f) {
-  // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
-  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
-  // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  return fx(f); // #aliveTemp2_fx_call
-}
-
-half test(half x) {
-  return aliveTemp2(x);
-}
-
-float test(float x) {
-  return aliveTemp2(x);
-}
-
-class MyClass
-{
-  float F;
-  float makeF() {
-    // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-    float A = fx(F); // #MyClass_makeF_fx_call
-    // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-    float B = fy(F); // #MyClass_makeF_fy_call
-    // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}}
-    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-    float C = fz(F); // #MyClass_makeF_fz_call
-    return 0;
-  }
-};
-
-// Exported function without body, not used
-export void exportedFunctionUnused(float f);
-
-// Exported function with body, without export, not used
-void exportedFunctionUnused(float f) {
-  // expected-warning@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #exportedFunctionUnused_fx_call
-
-  // API with shader-stage-specific availability in unused exported library function
-  // - no errors expected because the actual shader stage this function
-  // will be used in is not known at this time
-  float B = fy(f);
-  float C = fz(f);
-}
-
-// 
Exported function with body - called from main() which is a compute shader entry point -export void exportedFunctionUsed(float f) { - // expected-warning@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUsed_fx_call - - // expected-warning@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #exportedFunctionUsed_fy_call - - // expected-warning@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #exportedFunctionUsed_fz_call -} - -// Shader entry point without body -[shader("compute")] -[numthreads(4,1,1)] -float main(); - -// Shader entry point with body -[shader("compute")] -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - exportedFunctionUsed(1.0f); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + + // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + + // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + + return 0; +} + +float alive(float f) { + // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + + // expected-warning@#alive_fy_call {{'fy' is only 
available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + + // expected-warning@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return also_dead(f); +} + +template +T aliveTemp(T f) { + // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template T aliveTemp2(T f) { + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +// Exported function without body, not used 
+export void exportedFunctionUnused(float f);
+
+// Exported function with body, without export, not used
+void exportedFunctionUnused(float f) {
+  // expected-warning@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #exportedFunctionUnused_fx_call
+
+  // API with shader-stage-specific availability in unused exported library function
+  // - no errors expected because the actual shader stage this function
+  // will be used in is not known at this time
+  float B = fy(f);
+  float C = fz(f);
+}
+
+// Exported function with body - called from main() which is a compute shader entry point
+export void exportedFunctionUsed(float f) {
+  // expected-warning@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #exportedFunctionUsed_fx_call
+
+  // expected-warning@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #exportedFunctionUsed_fy_call
+
+  // expected-warning@#exportedFunctionUsed_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #exportedFunctionUsed_fz_call
+}
+
+// Shader entry point without body
+[shader("compute")]
+[numthreads(4,1,1)]
+float main();
+
+// Shader entry point with body
+[shader("compute")]
+[numthreads(4,1,1)]
+float main() {
+  float f = 3;
+  MyClass C = { 1.0f };
+  float a = alive(f);
+  float b = aliveTemp(f); // #aliveTemp_inst
+  float c = C.makeF();
+  float d = test((float)1.0);
+  float e = test((half)1.0);
+  exportedFunctionUsed(1.0f);
+  return a * b * c;
+}
diff --git a/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl
index a8783c10cbab..b67e10c9a901 100644
--- a/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl
+++ b/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl
@@ -1,129 +1,129 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \
-// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s
-
-__attribute__((availability(shadermodel, introduced = 6.5)))
-float fx(float); // #fx
-
-__attribute__((availability(shadermodel, introduced = 6.6)))
-half fx(half); // #fx_half
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
-float fy(float); // #fy
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
-float fz(float); // #fz
-
-float also_alive(float f) {
-  // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = 
fx(f); // #also_alive_fx_call - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_dead_fx_call - // expected-error@#also_dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_dead_fy_call - // expected-error@#also_dead_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_dead_fz_call - return 0; -} - -float dead(float f) { - // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #dead_fx_call - // expected-error@#dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #dead_fy_call - // expected-error@#dead_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #dead_fz_call - - return also_dead(f); -} - -template -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call 
{{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp' requested here}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} -} - -float test(float x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - } -}; - -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - return a * b * c; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ +// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = 
compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_dead_fx_call + // expected-error@#also_dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_dead_fy_call + // expected-error@#also_dead_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_dead_fz_call + return 0; +} + +float dead(float f) { + // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #dead_fx_call + // expected-error@#dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in 
compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #dead_fy_call + // expected-error@#dead_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #dead_fz_call + + return also_dead(f); +} + +template +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp' requested here}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} +} + +float test(float x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + } +}; + +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = 
test((half)1.0); + return a * b * c; } \ No newline at end of file diff --git a/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl index 0fffbc96dac1..c7be5afbc2d2 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl @@ -1,192 +1,192 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -// FIXME: all diagnostics marked as FUTURE will come alive when HLSL default -// diagnostic mode is implemented in a future PR which will verify calls in -// all functions that are reachable from the shader library entry points - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_dead_fx_call - - // Call to environment-specific 
function from an unreachable function - // in a shader library - no diagnostic expected. - float B = fy(f); // #also_dead_fy_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float C = fz(f); // #also_dead_fz_call - return 0; -} - -float dead(float f) { - // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #dead_fx_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float B = fy(f); // #dead_fy_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float C = fz(f); // #dead_fz_call - - return also_dead(f); -} - -template -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp' requested here}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} -} - -float test(float x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // 
#MyClass_makeF_fy_call
-    // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}}
-    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-    float C = fz(F); // #MyClass_makeF_fz_call
-  }
-};
-
-// Exported function without body, not used
-export void exportedFunctionUnused(float f);
-
-// Exported function with body, without export, not used
-void exportedFunctionUnused(float f) {
-  // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #exportedFunctionUnused_fx_call
-
-  // API with shader-stage-specific availability in unused exported library function
-  // - no errors expected because the actual shader stage this function
-  // will be used in is not known at this time
-  float B = fy(f);
-  float C = fz(f);
-}
-
-// Exported function with body - called from main() which is a compute shader entry point
-export void exportedFunctionUsed(float f) {
-  // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-  float A = fx(f); // #exportedFunctionUsed_fx_call
-
-  // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
-  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float B = fy(f); // #exportedFunctionUsed_fy_call
-
-  // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}}
-  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
-  float C = fz(f); // #exportedFunctionUsed_fz_call
-}
-
-namespace A {
-  namespace B {
-    export {
-      void exportedFunctionInNS(float x) {
-        // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
-        // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
-        float A = fx(x); // #exportedFunctionInNS_fx_call
-
-        // API with shader-stage-specific availability in exported library function
-        // - no errors expected because the actual shader stage this function
-        // will be used in is not known at this time
-        float B = fy(x);
-        float C = fz(x);
-      }
-    }
-  }
-}
-
-[shader("compute")]
-[numthreads(4,1,1)]
-float main() {
-  float f = 3;
-  MyClass C = { 1.0f };
-  float a = alive(f); float b = aliveTemp(f); // #aliveTemp_inst
-  float c = C.makeF();
-  float d = test((float)1.0);
-  float e = test((half)1.0);
-  exportedFunctionUsed(1.0f);
-  return a * b * c;
-}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \
+// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s
+
+__attribute__((availability(shadermodel, introduced = 6.5)))
+float fx(float); // #fx
+
+__attribute__((availability(shadermodel, introduced = 6.6)))
+half fx(half); // #fx_half
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, 
environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +// FIXME: all diagnostics marked as FUTURE will come alive when HLSL default +// diagnostic mode is implemented in a future PR which will verify calls in +// all functions that are reachable from the shader library entry points + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_dead_fx_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float B = fy(f); // #also_dead_fy_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float C = fz(f); // #also_dead_fz_call + return 0; +} + +float dead(float f) { + // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #dead_fx_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. 
+ float B = fy(f); // #dead_fy_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float C = fz(f); // #dead_fz_call + + return also_dead(f); +} + +template +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp' requested here}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} +} + +float test(float x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2' requested here}} +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + } +}; + +// Exported function without body, not used +export void exportedFunctionUnused(float f); + +// Exported function with body, without export, not used +void exportedFunctionUnused(float f) { + // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is 
Shader Model 6.0}}
+  float A = fx(f); // #exportedFunctionUnused_fx_call
+
+  // API with shader-stage-specific availability in unused exported library function
+  // - no errors expected because the actual shader stage this function
+  // will be used in is not known at this time
+  float B = fy(f);
+  float C = fz(f);
+}
+
+// Exported function with body - called from main() which is a compute shader entry point
+export void exportedFunctionUsed(float f) {
+  // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #exportedFunctionUsed_fx_call
+
+  // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #exportedFunctionUsed_fy_call
+
+  // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #exportedFunctionUsed_fz_call
+}
+
+namespace A {
+  namespace B {
+    export {
+      void exportedFunctionInNS(float x) {
+        // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+        // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+        float A = fx(x); // #exportedFunctionInNS_fx_call
+
+        // API with shader-stage-specific availability in exported library function
+        // - no errors expected because the actual shader stage this function
+        // will be used in is not known at this time
+        float B = fy(x);
+        float C = fz(x);
+      }
+    }
+  }
+}
+
+[shader("compute")]
+[numthreads(4,1,1)]
+float main() {
+  float f = 3;
+  MyClass C = { 1.0f };
+  float a = alive(f); float b = aliveTemp(f); // #aliveTemp_inst
+  float c = C.makeF();
+  float d = test((float)1.0);
+  float e = test((half)1.0);
+  exportedFunctionUsed(1.0f);
+  return a * b * c;
+}
diff --git a/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl b/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl
index bfefc9b116a6..b56ab8fe4526 100644
--- a/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl
+++ b/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl
@@ -1,57 +1,57 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \
-// RUN: -fsyntax-only -verify %s
-
-__attribute__((availability(shadermodel, introduced = 6.5)))
-float fx(float); // #fx
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
-__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
-float fy(float); // #fy
-
-__attribute__((availability(shadermodel, introduced = 5.0, environment = compute)))
-float fz(float); // #fz
-
-
-void F(float f) {
-  // Make sure we only get this error once, even though this function is scanned twice - once
-  // in compute shader context and once in pixel shader context.
- // expected-error@#fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #fx_call - - // expected-error@#fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #fy_call - - // expected-error@#fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 5.0 in compute environment here, but the deployment target is Shader Model 6.0 pixel environment}} - float X = fz(f); // #fz_call -} - -void deadCode(float f) { - // no diagnostics expected under default diagnostic mode - float A = fx(f); - float B = fy(f); - float X = fz(f); -} - -// Pixel shader -[shader("pixel")] -void mainPixel() { - F(1.0); -} - -// First Compute shader -[shader("compute")] -[numthreads(4,1,1)] -void mainCompute1() { - F(2.0); -} - -// Second compute shader to make sure we do not get duplicate messages if F is called -// from multiple entry points. -[shader("compute")] -[numthreads(4,1,1)] -void mainCompute2() { - F(3.0); -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = compute))) +float fz(float); // #fz + + +void F(float f) { + // Make sure we only get this error once, even though this function is scanned twice - once + // in compute shader context and once in pixel shader context. + // expected-error@#fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #fx_call + + // expected-error@#fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #fy_call + + // expected-error@#fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 5.0 in compute environment here, but the deployment target is Shader Model 6.0 pixel environment}} + float X = fz(f); // #fz_call +} + +void deadCode(float f) { + // no diagnostics expected under default diagnostic mode + float A = fx(f); + float B = fy(f); + float X = fz(f); +} + +// Pixel shader +[shader("pixel")] +void mainPixel() { + F(1.0); +} + +// First Compute shader +[shader("compute")] +[numthreads(4,1,1)] +void mainCompute1() { + F(2.0); +} + +// Second compute shader to make sure we do not get duplicate messages if F is called +// from multiple entry points. 
+[shader("compute")] +[numthreads(4,1,1)] +void mainCompute2() { + F(3.0); +} diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl index 1ec56542113d..a472d5519dc5 100644 --- a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl @@ -1,19 +1,19 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s - -typedef vector float3; - -StructuredBuffer Buffer; - -// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}} -// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}} -StructuredBuffer BufferErr1; - -// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}} -// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}} -StructuredBuffer<> BufferErr2; - -[numthreads(1,1,1)] -void main() { - (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer>'}} - // expected-note@* {{implicitly declared private here}} -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s + +typedef vector float3; + +StructuredBuffer Buffer; + +// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}} +// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}} +StructuredBuffer BufferErr1; + +// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}} +// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}} +StructuredBuffer<> BufferErr2; + +[numthreads(1,1,1)] +void main() { + (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer>'}} + // expected-note@* {{implicitly declared private here}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl index 354e7abb8a31..423f5bac9471 100644 --- a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl @@ -1,43 +1,43 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify - -void test_too_few_arg() -{ - return __builtin_hlsl_cross(); - // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} -} - -void test_too_many_arg(float3 p0) -{ - return __builtin_hlsl_cross(p0, p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} -} - -bool builtin_bool_to_float_type_promotion(bool p1) -{ - return __builtin_hlsl_cross(p1, p1); - // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}} -} - -bool builtin_cross_int_to_float_promotion(int p1) -{ - return __builtin_hlsl_cross(p1, p1); - // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} -} - -bool2 builtin_cross_int2_to_float2_promotion(int2 p1) -{ - return __builtin_hlsl_cross(p1, p1); - // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} - -float2 builtin_cross_float2(float2 p1, float2 p2) -{ - return __builtin_hlsl_cross(p1, p2); - // expected-error@-1 {{too many elements in vector operand (expected 3 elements, have 2)}} -} - -float3 builtin_cross_float3_int3(float3 p1, int3 p2) -{ 
- return __builtin_hlsl_cross(p1, p2); - // expected-error@-1 {{all arguments to '__builtin_hlsl_cross' must have the same type}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify + +void test_too_few_arg() +{ + return __builtin_hlsl_cross(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} +} + +void test_too_many_arg(float3 p0) +{ + return __builtin_hlsl_cross(p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +bool builtin_bool_to_float_type_promotion(bool p1) +{ + return __builtin_hlsl_cross(p1, p1); + // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}} +} + +bool builtin_cross_int_to_float_promotion(int p1) +{ + return __builtin_hlsl_cross(p1, p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +bool2 builtin_cross_int2_to_float2_promotion(int2 p1) +{ + return __builtin_hlsl_cross(p1, p1); + // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} + +float2 builtin_cross_float2(float2 p1, float2 p2) +{ + return __builtin_hlsl_cross(p1, p2); + // expected-error@-1 {{too many elements in vector operand (expected 3 elements, have 2)}} +} + +float3 builtin_cross_float3_int3(float3 p1, int3 p2) +{ + return __builtin_hlsl_cross(p1, p2); + // expected-error@-1 {{all arguments to '__builtin_hlsl_cross' must have the same type}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl index b876a8e84cb3..bfbd8b28257a 100644 --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow - -double test_double_builtin(double p0, double p1) { - return TEST_FUNC(p0, p1); - // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}} -} - -double2 test_vec_double_builtin(double2 p0, double2 p1) { - return TEST_FUNC(p0, p1); - // expected-error@-1 {{passing 'double2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow + +double test_double_builtin(double 
p0, double p1) { + return TEST_FUNC(p0, p1); + // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}} +} + +double2 test_vec_double_builtin(double2 p0, double2 p1) { + return TEST_FUNC(p0, p1); + // expected-error@-1 {{passing 'double2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl index c5e2ac0b502d..281faada6f5e 100644 --- a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl @@ -1,32 +1,32 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected - - -void test_too_few_arg() -{ - return __builtin_hlsl_length(); - // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} -} - -void test_too_many_arg(float2 p0) -{ - return __builtin_hlsl_length(p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} -} - -bool builtin_bool_to_float_type_promotion(bool p1) -{ - return __builtin_hlsl_length(p1); - // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} -} - -bool builtin_length_int_to_float_promotion(int p1) -{ - return __builtin_hlsl_length(p1); - // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} -} - -bool2 builtin_length_int2_to_float2_promotion(int2 p1) -{ - return __builtin_hlsl_length(p1); - // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected + + +void test_too_few_arg() +{ + return __builtin_hlsl_length(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +void test_too_many_arg(float2 p0) +{ + return __builtin_hlsl_length(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +bool builtin_bool_to_float_type_promotion(bool p1) +{ + return __builtin_hlsl_length(p1); + // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} +} + +bool builtin_length_int_to_float_promotion(int p1) +{ + return __builtin_hlsl_length(p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +bool2 builtin_length_int2_to_float2_promotion(int2 p1) +{ + return __builtin_hlsl_length(p1); + // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl index 3720dca9b88a..fc48c9b2589f 100644 --- a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl @@ -1,31 +1,31 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected - -void test_too_few_arg() -{ - return __builtin_hlsl_normalize(); - // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} -} - -void test_too_many_arg(float2 
p0) -{ - return __builtin_hlsl_normalize(p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} -} - -bool builtin_bool_to_float_type_promotion(bool p1) -{ - return __builtin_hlsl_normalize(p1); - // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} -} - -bool builtin_normalize_int_to_float_promotion(int p1) -{ - return __builtin_hlsl_normalize(p1); - // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} -} - -bool2 builtin_normalize_int2_to_float2_promotion(int2 p1) -{ - return __builtin_hlsl_normalize(p1); - // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected + +void test_too_few_arg() +{ + return __builtin_hlsl_normalize(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +void test_too_many_arg(float2 p0) +{ + return __builtin_hlsl_normalize(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +bool builtin_bool_to_float_type_promotion(bool p1) +{ + return __builtin_hlsl_normalize(p1); + // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} +} + +bool builtin_normalize_int_to_float_promotion(int p1) +{ + return __builtin_hlsl_normalize(p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +bool2 builtin_normalize_int2_to_float2_promotion(int2 p1) +{ + return __builtin_hlsl_normalize(p1); + // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl index a76c5ff5dbd2..823585201ca6 100644 --- a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl @@ -1,31 +1,31 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected - -void test_too_few_arg() -{ - return __builtin_hlsl_step(); - // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} -} - -void test_too_many_arg(float2 p0) -{ - return __builtin_hlsl_step(p0, p0, p0); - // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} -} - -bool builtin_bool_to_float_type_promotion(bool p1) -{ - return __builtin_hlsl_step(p1, p1); - // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} -} - -bool builtin_step_int_to_float_promotion(int p1) -{ - return __builtin_hlsl_step(p1, p1); - // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} -} - -bool2 builtin_step_int2_to_float2_promotion(int2 p1) -{ - return __builtin_hlsl_step(p1, p1); - // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} -} +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected + +void test_too_few_arg() +{ + return __builtin_hlsl_step(); + // expected-error@-1 
{{too few arguments to function call, expected 2, have 0}} +} + +void test_too_many_arg(float2 p0) +{ + return __builtin_hlsl_step(p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +bool builtin_bool_to_float_type_promotion(bool p1) +{ + return __builtin_hlsl_step(p1, p1); + // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}} +} + +bool builtin_step_int_to_float_promotion(int p1) +{ + return __builtin_hlsl_step(p1, p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +bool2 builtin_step_int2_to_float2_promotion(int2 p1) +{ + return __builtin_hlsl_step(p1, p1); + // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl index 1223a131af35..8c0f8d6f271d 100644 --- a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl @@ -1,81 +1,81 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s -// expected-no-diagnostics - -_Static_assert(__builtin_hlsl_is_intangible(__hlsl_resource_t), ""); -// no need to check array of __hlsl_resource_t, arrays of sizeless types are not supported - -_Static_assert(!__builtin_hlsl_is_intangible(int), ""); -_Static_assert(!__builtin_hlsl_is_intangible(float3), ""); -_Static_assert(!__builtin_hlsl_is_intangible(half[4]), ""); - -typedef __hlsl_resource_t Res; -_Static_assert(__builtin_hlsl_is_intangible(const Res), ""); -// no need to check array of Res, arrays of sizeless types are not supported - -struct ABuffer { - const int i[10]; - __hlsl_resource_t h; -}; -_Static_assert(__builtin_hlsl_is_intangible(ABuffer), ""); -_Static_assert(__builtin_hlsl_is_intangible(ABuffer[10]), ""); - -struct MyStruct { - half2 h2; - int3 i3; -}; -_Static_assert(!__builtin_hlsl_is_intangible(MyStruct), ""); -_Static_assert(!__builtin_hlsl_is_intangible(MyStruct[10]), ""); - -class MyClass { - int3 ivec; - float farray[12]; - MyStruct ms; - ABuffer buf; -}; -_Static_assert(__builtin_hlsl_is_intangible(MyClass), ""); -_Static_assert(__builtin_hlsl_is_intangible(MyClass[2]), ""); - -union U { - double d[4]; - Res buf; -}; -_Static_assert(__builtin_hlsl_is_intangible(U), ""); -_Static_assert(__builtin_hlsl_is_intangible(U[100]), ""); - -class MyClass2 { - int3 ivec; - float farray[12]; - U u; -}; -_Static_assert(__builtin_hlsl_is_intangible(MyClass2), ""); -_Static_assert(__builtin_hlsl_is_intangible(MyClass2[5]), ""); - -class Simple { - int a; -}; - -template struct TemplatedBuffer { - T a; - __hlsl_resource_t h; -}; -_Static_assert(__builtin_hlsl_is_intangible(TemplatedBuffer), ""); - -struct MyStruct2 : TemplatedBuffer { - float x; -}; -_Static_assert(__builtin_hlsl_is_intangible(MyStruct2), ""); - -struct MyStruct3 { - const TemplatedBuffer TB[10]; -}; -_Static_assert(__builtin_hlsl_is_intangible(MyStruct3), ""); - -template struct SimpleTemplate { - T a; -}; -_Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), ""); -_Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), ""); - -_Static_assert(__builtin_hlsl_is_intangible(RWBuffer), ""); 
-_Static_assert(__builtin_hlsl_is_intangible(StructuredBuffer), ""); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s +// expected-no-diagnostics + +_Static_assert(__builtin_hlsl_is_intangible(__hlsl_resource_t), ""); +// no need to check array of __hlsl_resource_t, arrays of sizeless types are not supported + +_Static_assert(!__builtin_hlsl_is_intangible(int), ""); +_Static_assert(!__builtin_hlsl_is_intangible(float3), ""); +_Static_assert(!__builtin_hlsl_is_intangible(half[4]), ""); + +typedef __hlsl_resource_t Res; +_Static_assert(__builtin_hlsl_is_intangible(const Res), ""); +// no need to check array of Res, arrays of sizeless types are not supported + +struct ABuffer { + const int i[10]; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(ABuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(ABuffer[10]), ""); + +struct MyStruct { + half2 h2; + int3 i3; +}; +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct), ""); +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct[10]), ""); + +class MyClass { + int3 ivec; + float farray[12]; + MyStruct ms; + ABuffer buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass[2]), ""); + +union U { + double d[4]; + Res buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(U), ""); +_Static_assert(__builtin_hlsl_is_intangible(U[100]), ""); + +class MyClass2 { + int3 ivec; + float farray[12]; + U u; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass2), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass2[5]), ""); + +class Simple { + int a; +}; + +template struct TemplatedBuffer { + T a; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(TemplatedBuffer), ""); + +struct MyStruct2 : TemplatedBuffer { + float x; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct2), ""); + +struct MyStruct3 { + const TemplatedBuffer TB[10]; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct3), ""); + +template struct SimpleTemplate { + T a; +}; +_Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), ""); +_Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), ""); + +_Static_assert(__builtin_hlsl_is_intangible(RWBuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(StructuredBuffer), ""); diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl index 33614e87640d..de9ac90b895f 100644 --- a/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s - -struct Undefined; // expected-note {{forward declaration of 'Undefined'}} -_Static_assert(!__builtin_hlsl_is_intangible(Undefined), ""); // expected-error{{incomplete type 'Undefined' used in type trait expression}} - -void fn(int X) { // expected-note {{declared here}} - // expected-error@#vla {{variable length arrays are not supported for the current target}} - // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_intangible'}} - // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} - // expected-note@#vla {{function parameter 'X' with unknown value cannot be used in a 
constant expression}} - _Static_assert(!__builtin_hlsl_is_intangible(int[X]), ""); // #vla -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s + +struct Undefined; // expected-note {{forward declaration of 'Undefined'}} +_Static_assert(!__builtin_hlsl_is_intangible(Undefined), ""); // expected-error{{incomplete type 'Undefined' used in type trait expression}} + +void fn(int X) { // expected-note {{declared here}} + // expected-error@#vla {{variable length arrays are not supported for the current target}} + // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_intangible'}} + // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} + // expected-note@#vla {{function parameter 'X' with unknown value cannot be used in a constant expression}} + _Static_assert(!__builtin_hlsl_is_intangible(int[X]), ""); // #vla +} diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl index 4e50f70952ad..760c057630a7 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl @@ -1,42 +1,42 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify - -// expected-error@+1{{binding type 't' only applies to SRV resources}} -float f1 : register(t0); - -// expected-error@+1 {{binding type 'u' only applies to UAV resources}} -float f2 : register(u0); - -// expected-error@+1{{binding type 'b' only applies to constant buffers. The 'bool constant' binding type is no longer supported}} -float f3 : register(b9); - -// expected-error@+1 {{binding type 's' only applies to sampler state}} -float f4 : register(s0); - -// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} -float f5 : register(i9); - -// expected-error@+1{{binding type 'x' is invalid}} -float f6 : register(x9); - -cbuffer g_cbuffer1 { -// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} - float f7 : register(c2); -}; - -tbuffer g_tbuffer1 { -// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} - float f8 : register(c2); -}; - -cbuffer g_cbuffer2 { -// expected-error@+1{{binding type 'b' only applies to constant buffer resources}} - float f9 : register(b2); -}; - -tbuffer g_tbuffer2 { -// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} - float f10 : register(i2); -}; - -// expected-error@+1{{binding type 'c' only applies to numeric variables in the global scope}} -RWBuffer f11 : register(c3); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// expected-error@+1{{binding type 't' only applies to SRV resources}} +float f1 : register(t0); + +// expected-error@+1 {{binding type 'u' only applies to UAV resources}} +float f2 : register(u0); + +// expected-error@+1{{binding type 'b' only applies to constant buffers. The 'bool constant' binding type is no longer supported}} +float f3 : register(b9); + +// expected-error@+1 {{binding type 's' only applies to sampler state}} +float f4 : register(s0); + +// expected-error@+1{{binding type 'i' ignored. 
The 'integer constant' binding type is no longer supported}}
+float f5 : register(i9);
+
+// expected-error@+1{{binding type 'x' is invalid}}
+float f6 : register(x9);
+
+cbuffer g_cbuffer1 {
+// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}}
+  float f7 : register(c2);
+};
+
+tbuffer g_tbuffer1 {
+// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}}
+  float f8 : register(c2);
+};
+
+cbuffer g_cbuffer2 {
+// expected-error@+1{{binding type 'b' only applies to constant buffer resources}}
+  float f9 : register(b2);
+};
+
+tbuffer g_tbuffer2 {
+// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}}
+  float f10 : register(i2);
+};
+
+// expected-error@+1{{binding type 'c' only applies to numeric variables in the global scope}}
+RWBuffer f11 : register(c3);
diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl
index 503c8469666f..4c9e9a6b44c9 100644
--- a/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl
+++ b/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl
@@ -1,9 +1,9 @@
-// RUN: not %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s | FileCheck %s
-
-// XFAIL: *
-// This expectedly fails because RayQuery is an unsupported type.
-// When it becomes supported, we should expect an error due to
-// the variable type being classified as "other", and according
-// to the spec, err_hlsl_unsupported_register_type_and_variable_type
-// should be emitted.
-RayQuery<0> r1: register(t0);
+// RUN: not %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s | FileCheck %s
+
+// XFAIL: *
+// This expectedly fails because RayQuery is an unsupported type.
+// When it becomes supported, we should expect an error due to
+// the variable type being classified as "other", and according
+// to the spec, err_hlsl_unsupported_register_type_and_variable_type
+// should be emitted.
+RayQuery<0> r1: register(t0);
diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl
index ea43e27b5b5a..4b6af47c0ab7 100644
--- a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl
+++ b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl
@@ -1,49 +1,49 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify
-
-// This test validates the diagnostics that are emitted when a variable with a "resource" type
-// is bound to a register using the register annotation
-
-
-template <typename T>
-struct MyTemplatedSRV {
-  __hlsl_resource_t [[hlsl::resource_class(SRV)]] x;
-};
-
-struct MySRV {
-  __hlsl_resource_t [[hlsl::resource_class(SRV)]] x;
-};
-
-struct MySampler {
-  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x;
-};
-
-struct MyUAV {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
-};
-
-struct MyCBuffer {
-  __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x;
-};
-
-
-// expected-error@+1 {{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}}
-MySRV invalid : register(i2);
-
-// expected-error@+1 {{binding type 't' only applies to SRV resources}}
-MyUAV a : register(t2, space1);
-
-// expected-error@+1 {{binding type 'u' only applies to UAV resources}}
-MySampler b : register(u2, space1);
-
-// expected-error@+1 {{binding type 'b' only applies to constant buffer resources}}
-MyTemplatedSRV c : register(b2);
-
-// expected-error@+1 {{binding type 's' only applies to sampler state}}
-MyUAV d : register(s2, space1);
-
-// empty binding prefix cases:
-// expected-error@+1 {{expected identifier}}
-MyTemplatedSRV e: register();
-
-// expected-error@+1 {{expected identifier}}
-MyTemplatedSRV f: register("");
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify
+
+// This test validates the diagnostics that are emitted when a variable with a "resource" type
+// is bound to a register using the register annotation
+
+
+template <typename T>
+struct MyTemplatedSRV {
+  __hlsl_resource_t [[hlsl::resource_class(SRV)]] x;
+};
+
+struct MySRV {
+  __hlsl_resource_t [[hlsl::resource_class(SRV)]] x;
+};
+
+struct MySampler {
+  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x;
+};
+
+struct MyUAV {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
+};
+
+struct MyCBuffer {
+  __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x;
+};
+
+
+// expected-error@+1 {{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}}
+MySRV invalid : register(i2);
+
+// expected-error@+1 {{binding type 't' only applies to SRV resources}}
+MyUAV a : register(t2, space1);
+
+// expected-error@+1 {{binding type 'u' only applies to UAV resources}}
+MySampler b : register(u2, space1);
+
+// expected-error@+1 {{binding type 'b' only applies to constant buffer resources}}
+MyTemplatedSRV c : register(b2);
+
+// expected-error@+1 {{binding type 's' only applies to sampler state}}
+MyUAV d : register(s2, space1);
+
+// empty binding prefix cases:
+// expected-error@+1 {{expected identifier}}
+MyTemplatedSRV e: register();
+
+// expected-error@+1 {{expected identifier}}
+MyTemplatedSRV f: register("");
diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl
index 7f248e30c070..e63f264452da 100644
--- a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl
+++ b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl
@@ -1,27 +1,27 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify
-
-// expected-no-diagnostics
-float f2 : register(b9);
-
-float f3 : register(i9);
-
-cbuffer g_cbuffer1 {
-  float f4 : register(c2);
-};
-
-
-struct Eg12{
-  RWBuffer a;
-};
-
-Eg12 e12 : register(c9);
-
-Eg12 bar : register(i1);
-
-struct Eg7 {
-  struct Bar {
-    float f;
-  };
-  Bar b;
-};
-Eg7 e7 : register(t0);
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify
+
+// expected-no-diagnostics
+float f2 : register(b9);
+
+float f3 : register(i9);
+
+cbuffer g_cbuffer1 {
+  float f4 : register(c2);
+};
+
+
+struct Eg12{
+  RWBuffer a;
+};
+
+Eg12 e12 : register(c9);
+
+Eg12 bar : register(i1);
+
+struct Eg7 {
+  struct Bar {
+    float f;
+  };
+  Bar b;
+};
+Eg7 e7 : register(t0);
diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl 
b/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl index 3001dbb1e3ec..70e64e6ca752 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl @@ -1,62 +1,62 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify - -// valid -cbuffer cbuf { - RWBuffer r : register(u0, space0); -} - -cbuffer cbuf2 { - struct x { - // this test validates that no diagnostic is emitted on the space parameter, because - // this register annotation is not in the global scope. - // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} - RWBuffer E : register(u2, space3); - }; -} - -struct MyStruct { - RWBuffer E; -}; - -cbuffer cbuf3 { - // valid - MyStruct E : register(u2, space3); -} - -// valid -MyStruct F : register(u3, space4); - -cbuffer cbuf4 { - // this test validates that no diagnostic is emitted on the space parameter, because - // this register annotation is not in the global scope. - // expected-error@+1 {{binding type 'u' only applies to UAV resources}} - float a : register(u2, space3); -} - -// expected-error@+1 {{invalid space specifier 's2' used; expected 'space' followed by an integer, like space1}} -cbuffer a : register(b0, s2) { - -} - -// expected-error@+1 {{invalid space specifier 'spaces' used; expected 'space' followed by an integer, like space1}} -cbuffer b : register(b2, spaces) { - -} - -// expected-error@+1 {{wrong argument format for hlsl attribute, use space3 instead}} -cbuffer c : register(b2, space 3) {} - -// expected-error@+1 {{register space cannot be specified on global constants}} -int d : register(c2, space3); - -// expected-error@+1 {{register space cannot be specified on global constants}} -int e : register(c2, space0); - -// expected-error@+1 {{register space cannot be specified on global constants}} -int f : register(c2, space00); - -// valid -RWBuffer g : register(u2, space0); - -// valid -RWBuffer h : register(u2, space0); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// valid +cbuffer cbuf { + RWBuffer r : register(u0, space0); +} + +cbuffer cbuf2 { + struct x { + // this test validates that no diagnostic is emitted on the space parameter, because + // this register annotation is not in the global scope. + // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} + RWBuffer E : register(u2, space3); + }; +} + +struct MyStruct { + RWBuffer E; +}; + +cbuffer cbuf3 { + // valid + MyStruct E : register(u2, space3); +} + +// valid +MyStruct F : register(u3, space4); + +cbuffer cbuf4 { + // this test validates that no diagnostic is emitted on the space parameter, because + // this register annotation is not in the global scope. 
+  // expected-error@+1 {{binding type 'u' only applies to UAV resources}}
+  float a : register(u2, space3);
+}
+
+// expected-error@+1 {{invalid space specifier 's2' used; expected 'space' followed by an integer, like space1}}
+cbuffer a : register(b0, s2) {
+
+}
+
+// expected-error@+1 {{invalid space specifier 'spaces' used; expected 'space' followed by an integer, like space1}}
+cbuffer b : register(b2, spaces) {
+
+}
+
+// expected-error@+1 {{wrong argument format for hlsl attribute, use space3 instead}}
+cbuffer c : register(b2, space 3) {}
+
+// expected-error@+1 {{register space cannot be specified on global constants}}
+int d : register(c2, space3);
+
+// expected-error@+1 {{register space cannot be specified on global constants}}
+int e : register(c2, space0);
+
+// expected-error@+1 {{register space cannot be specified on global constants}}
+int f : register(c2, space00);
+
+// valid
+RWBuffer g : register(u2, space0);
+
+// valid
+RWBuffer h : register(u2, space0);
diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl
index 235004102a53..40517f393e12 100644
--- a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl
+++ b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl
@@ -1,135 +1,135 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify
-
-template <typename T>
-struct MyTemplatedUAV {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
-};
-
-struct MySRV {
-  __hlsl_resource_t [[hlsl::resource_class(SRV)]] x;
-};
-
-struct MySampler {
-  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x;
-};
-
-struct MyUAV {
-  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
-};
-
-struct MyCBuffer {
-  __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x;
-};
-
-// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0
-struct Eg1 {
-  float f;
-  MySRV SRVBuf;
-  MyUAV UAVBuf;
-  };
-Eg1 e1 : register(t0) : register(u0);
-
-// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0.
-// UAVBuf2 gets automatically assigned to u1 even though there is no explicit binding for u1.
-struct Eg2 {
-  float f;
-  MySRV SRVBuf;
-  MyUAV UAVBuf;
-  MyUAV UAVBuf2;
-  };
-Eg2 e2 : register(t0) : register(u0);
-
-// Valid: Bar, the struct within Eg3, has a valid resource that can be bound to t0.
-struct Eg3 {
-  struct Bar {
-    MyUAV a;
-  };
-  Bar b;
-};
-Eg3 e3 : register(u0);
-
-// Valid: the first sampler state object within 's' is bound to slot 5
-struct Eg4 {
-  MySampler s[3];
-};
-
-Eg4 e4 : register(s5);
-
-
-struct Eg5 {
-  float f;
-};
-// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}}
-Eg5 e5 : register(t0);
-
-struct Eg6 {
-  float f;
-};
-// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}}
-Eg6 e6 : register(u0);
-
-struct Eg7 {
-  float f;
-};
-// expected-warning@+1{{binding type 'b' only applies to types containing constant buffer resources}}
-Eg7 e7 : register(b0);
-
-struct Eg8 {
-  float f;
-};
-// expected-warning@+1{{binding type 's' only applies to types containing sampler state}}
-Eg8 e8 : register(s0);
-
-struct Eg9 {
-  MySRV s;
-};
-// expected-warning@+1{{binding type 'c' only applies to types containing numeric types}}
-Eg9 e9 : register(c0);
-
-struct Eg10{
-  // expected-error@+1{{'register' attribute only applies to cbuffer/tbuffer and external global variables}}
-  MyTemplatedUAV a : register(u9);
-};
-Eg10 e10;
-
-
-template <typename R>
-struct Eg11 {
-  R b;
-};
-// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}}
-Eg11<MySRV> e11 : register(u0);
-// invalid because after template expansion, there are no valid resources inside Eg11 to bind as a UAV, only an SRV
-
-
-struct Eg12{
-  MySRV s1;
-  MySRV s2;
-};
-// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}}
-// expected-error@+1{{binding type 'u' cannot be applied more than once}}
-Eg12 e12 : register(u9) : register(u10);
-
-struct Eg13{
-  MySRV s1;
-  MySRV s2;
-};
-// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}}
-// expected-error@+2{{binding type 'u' cannot be applied more than once}}
-// expected-error@+1{{binding type 'u' cannot be applied more than once}}
-Eg13 e13 : register(u9) : register(u10) : register(u11);
-
-// expected-error@+1{{binding type 't' cannot be applied more than once}}
-Eg13 e13_2 : register(t11) : register(t12);
-
-struct Eg14{
-  MyTemplatedUAV r1;
-};
-// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}}
-Eg14 e14 : register(t9);
-
-struct Eg15 {
-  float f[4];
-};
-// expected no error
-Eg15 e15 : register(c0);
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify
+
+template <typename T>
+struct MyTemplatedUAV {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
+};
+
+struct MySRV {
+  __hlsl_resource_t [[hlsl::resource_class(SRV)]] x;
+};
+
+struct MySampler {
+  __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x;
+};
+
+struct MyUAV {
+  __hlsl_resource_t [[hlsl::resource_class(UAV)]] x;
+};
+
+struct MyCBuffer {
+  __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x;
+};
+
+// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0
+struct Eg1 {
+  float f;
+  MySRV SRVBuf;
+  MyUAV UAVBuf;
+  };
+Eg1 e1 : register(t0) : register(u0);
+
+// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0.
+// UAVBuf2 gets automatically assigned to u1 even though there is no explicit binding for u1.
+struct Eg2 {
+  float f;
+  MySRV SRVBuf;
+  MyUAV UAVBuf;
+  MyUAV UAVBuf2;
+  };
+Eg2 e2 : register(t0) : register(u0);
+
+// Valid: Bar, the struct within Eg3, has a valid resource that can be bound to t0.
+struct Eg3 {
+  struct Bar {
+    MyUAV a;
+  };
+  Bar b;
+};
+Eg3 e3 : register(u0);
+
+// Valid: the first sampler state object within 's' is bound to slot 5
+struct Eg4 {
+  MySampler s[3];
+};
+
+Eg4 e4 : register(s5);
+
+
+struct Eg5 {
+  float f;
+};
+// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}}
+Eg5 e5 : register(t0);
+
+struct Eg6 {
+  float f;
+};
+// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}}
+Eg6 e6 : register(u0);
+
+struct Eg7 {
+  float f;
+};
+// expected-warning@+1{{binding type 'b' only applies to types containing constant buffer resources}}
+Eg7 e7 : register(b0);
+
+struct Eg8 {
+  float f;
+};
+// expected-warning@+1{{binding type 's' only applies to types containing sampler state}}
+Eg8 e8 : register(s0);
+
+struct Eg9 {
+  MySRV s;
+};
+// expected-warning@+1{{binding type 'c' only applies to types containing numeric types}}
+Eg9 e9 : register(c0);
+
+struct Eg10{
+  // expected-error@+1{{'register' attribute only applies to cbuffer/tbuffer and external global variables}}
+  MyTemplatedUAV a : register(u9);
+};
+Eg10 e10;
+
+
+template <typename R>
+struct Eg11 {
+  R b;
+};
+// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}}
+Eg11<MySRV> e11 : register(u0);
+// invalid because after template expansion, there are no valid resources inside Eg11 to bind as a UAV, only an SRV
+
+
+struct Eg12{
+  MySRV s1;
+  MySRV s2;
+};
+// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}}
+// expected-error@+1{{binding type 'u' cannot be applied more than once}}
+Eg12 e12 : register(u9) : register(u10);
+
+struct Eg13{
+  MySRV s1;
+  MySRV s2;
+};
+// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}}
+// expected-error@+2{{binding type 'u' cannot be applied more than once}}
+// expected-error@+1{{binding type 'u' cannot be applied more than once}}
+Eg13 e13 : register(u9) : register(u10) : register(u11);
+
+// expected-error@+1{{binding type 't' cannot be applied more than once}}
+Eg13 e13_2 : register(t11) : register(t12);
+
+struct Eg14{
+  MyTemplatedUAV r1;
+};
+// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}}
+Eg14 e14 : register(t9);
+
+struct Eg15 {
+  float f[4];
+};
+// expected no error
+Eg15 e15 : register(c0);
diff --git a/clang/tools/scan-build/bin/scan-build.bat b/clang/tools/scan-build/bin/scan-build.bat
index f765f205b8ec..77be6746318f 100644
--- a/clang/tools/scan-build/bin/scan-build.bat
+++ b/clang/tools/scan-build/bin/scan-build.bat
@@ -1 +1 @@
-perl -S scan-build %*
+perl -S scan-build %*
diff --git a/clang/tools/scan-build/libexec/c++-analyzer.bat b/clang/tools/scan-build/libexec/c++-analyzer.bat
index 83c7172456a5..69f048a91671 100644
--- a/clang/tools/scan-build/libexec/c++-analyzer.bat
+++ b/clang/tools/scan-build/libexec/c++-analyzer.bat
@@ -1 +1 @@
-perl -S c++-analyzer %*
+perl -S c++-analyzer %*
diff --git a/clang/tools/scan-build/libexec/ccc-analyzer.bat b/clang/tools/scan-build/libexec/ccc-analyzer.bat
index fdd36f3bdd04..2a85376eb82b 100644
--- a/clang/tools/scan-build/libexec/ccc-analyzer.bat
+++ b/clang/tools/scan-build/libexec/ccc-analyzer.bat
@@ -1 +1 @@
-perl -S ccc-analyzer %*
+perl -S ccc-analyzer %*
diff --git a/clang/utils/ClangVisualizers/clang.natvis b/clang/utils/ClangVisualizers/clang.natvis
index 611c20dacce1..a7c70186bc46 100644
--- a/clang/utils/ClangVisualizers/clang.natvis
+++ b/clang/utils/ClangVisualizers/clang.natvis
@@ -1,1089 
+1,1089 @@ - - - - - - - LocInfoType - {(clang::Type::TypeClass)TypeBits.TC, en}Type - - {*(clang::BuiltinType *)this} - {*(clang::PointerType *)this} - {*(clang::ParenType *)this} - {(clang::BitIntType *)this} - {*(clang::LValueReferenceType *)this} - {*(clang::RValueReferenceType *)this} - {(clang::ConstantArrayType *)this,na} - {(clang::ConstantArrayType *)this,view(left)na} - {(clang::ConstantArrayType *)this,view(right)na} - {(clang::VariableArrayType *)this,na} - {(clang::VariableArrayType *)this,view(left)na} - {(clang::VariableArrayType *)this,view(right)na} - {(clang::IncompleteArrayType *)this,na} - {(clang::IncompleteArrayType *)this,view(left)na} - {(clang::IncompleteArrayType *)this,view(right)na} - {(clang::TypedefType *)this,na} - {(clang::TypedefType *)this,view(cpp)na} - {*(clang::AttributedType *)this} - {(clang::DecayedType *)this,na} - {(clang::DecayedType *)this,view(left)na} - {(clang::DecayedType *)this,view(right)na} - {(clang::ElaboratedType *)this,na} - {(clang::ElaboratedType *)this,view(left)na} - {(clang::ElaboratedType *)this,view(right)na} - {*(clang::TemplateTypeParmType *)this} - {*(clang::TemplateTypeParmType *)this,view(cpp)} - {*(clang::SubstTemplateTypeParmType *)this} - {*(clang::RecordType *)this} - {*(clang::RecordType *)this,view(cpp)} - {(clang::FunctionProtoType *)this,na} - {(clang::FunctionProtoType *)this,view(left)na} - {(clang::FunctionProtoType *)this,view(right)na} - {*(clang::TemplateSpecializationType *)this} - {*(clang::DeducedTemplateSpecializationType *)this} - {*(clang::DeducedTemplateSpecializationType *)this,view(cpp)} - {*(clang::InjectedClassNameType *)this} - {*(clang::DependentNameType *)this} - {*(clang::PackExpansionType *)this} - {(clang::LocInfoType *)this,na} - {(clang::LocInfoType *)this,view(cpp)na} - {this,view(poly)na} - {*this,view(cpp)} - - No visualizer yet for {(clang::Type::TypeClass)TypeBits.TC,en}Type - Dependence{" ",en} - - CachedLinkage: {(clang::Linkage)TypeBits.CachedLinkage,en} CachedLocalOrUnnamed - CachedLinkage: {(clang::Linkage)TypeBits.CachedLinkage,en}{" ",sb} - - FromAST - - - No TypeBits set beyond TypeClass - - {*this, view(Dependence)}{*this, view(Cache)}{*this, view(FromAST)} - {*this,view(cmn)} {{{*this,view(poly)}}} - - (clang::Type::TypeClass)TypeBits.TC - this,view(flags)na - CanonicalType - *(clang::BuiltinType *)this - *(clang::PointerType *)this - *(clang::ParenType*)this - *(clang::BitIntType*)this - *(clang::LValueReferenceType *)this - *(clang::RValueReferenceType *)this - (clang::ConstantArrayType *)this - (clang::VariableArrayType *)this - (clang::IncompleteArrayType *)this - *(clang::AttributedType *)this - (clang::DecayedType *)this - (clang::ElaboratedType *)this - (clang::TemplateTypeParmType *)this - (clang::SubstTemplateTypeParmType *)this - (clang::RecordType *)this - (clang::FunctionProtoType *)this - (clang::TemplateSpecializationType *)this - (clang::DeducedTemplateSpecializationType *)this - (clang::InjectedClassNameType *)this - (clang::DependentNameType *)this - (clang::PackExpansionType *)this - (clang::LocInfoType *)this - - - - - ElementType - - - - {ElementType,view(cpp)} - [{Size}] - {ElementType,view(cpp)}[{Size}] - - Size - (clang::ArrayType *)this - - - - {ElementType,view(cpp)} - [] - {ElementType,view(cpp)}[] - - (clang::ArrayType *)this - - - - {ElementType,view(cpp)} - [*] - {ElementType,view(cpp)}[*] - - (clang::Expr *)SizeExpr - (clang::ArrayType *)this - - - - {Decl,view(name)nd} - {Decl} - - Decl - *(clang::Type *)this, view(cmn) - - - - {PointeeType, 
[clang/utils/ClangVisualizers/clang.natvis: whole-file hunk in which every
line of the file is deleted and re-added with identical text, i.e. a
line-ending-only rewrite. The file holds the Visual Studio natvis debugger
visualizers for clang: display and expansion rules for clang::Type and its
subclasses (PointerType, ReferenceType, BitIntType, ConstantArrayType,
FunctionProtoType, TemplateSpecializationType, ...), Decl and DeclContext,
TemplateName and TemplateArgument, DeclarationName, IdentifierInfo,
Token/Lexer/Preprocessor, DeclSpec, VarDecl/ParmVarDecl/FunctionDecl, Stmt,
and related classes. The natvis XML markup was lost in extraction, so the
hunk body is not reproduced here.]
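For orientation, a natvis visualizer pairs a C++ type with a display string
and optional expansion items. A minimal sketch of one entry of the kind this
file contains (illustrative only; reconstructed from the display strings
visible in the hunk, not the exact file contents):

<?xml version="1.0" encoding="utf-8"?>
<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
  <!-- Show a clang::PointerType as its pointee's C++ spelling followed by '*' -->
  <Type Name="clang::PointerType">
    <DisplayString>{PointeeType, view(cpp)} *</DisplayString>
    <Expand>
      <Item Name="PointeeType">PointeeType</Item>
      <!-- Reuse the common clang::Type view for the base-class state -->
      <ExpandedItem>*(clang::Type *)this, view(cmn)</ExpandedItem>
    </Expand>
  </Type>
</AutoVisualizer>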
diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90
index 1b7ecb604ad6..765917f07d8e 100644
--- a/flang/test/Driver/msvc-dependent-lib-flags.f90
+++ b/flang/test/Driver/msvc-dependent-lib-flags.f90
@@ -1,36 +1,36 @@
-! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC
-! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG
-! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL
-! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG
-
-! MSVC: -fc1
-! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib
-! MSVC-SAME: -D_MT
-! MSVC-SAME: --dependent-lib=libcmt
-! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib
-! MSVC-SAME: --dependent-lib=FortranDecimal.static.lib
-
-! MSVC-DEBUG: -fc1
-! MSVC-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib
-! MSVC-DEBUG-SAME: -D_MT
-! MSVC-DEBUG-SAME: -D_DEBUG
-! MSVC-DEBUG-SAME: --dependent-lib=libcmtd
-! MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib
-! MSVC-DEBUG-SAME: --dependent-lib=FortranDecimal.static_dbg.lib
-
-! MSVC-DLL: -fc1
-! MSVC-DLL-SAME: --dependent-lib=clang_rt.builtins.lib
-! MSVC-DLL-SAME: -D_MT
-! MSVC-DLL-SAME: -D_DLL
-! MSVC-DLL-SAME: --dependent-lib=msvcrt
-! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib
-! MSVC-DLL-SAME: --dependent-lib=FortranDecimal.dynamic.lib
-
-! MSVC-DLL-DEBUG: -fc1
-! MSVC-DLL-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib
-! MSVC-DLL-DEBUG-SAME: -D_MT
-! MSVC-DLL-DEBUG-SAME: -D_DEBUG
-! MSVC-DLL-DEBUG-SAME: -D_DLL
-! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd
-! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib
-! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranDecimal.dynamic_dbg.lib
+! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC
+! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG
+! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL
+! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG
+
+! MSVC: -fc1
+! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib
+! MSVC-SAME: -D_MT
+! MSVC-SAME: --dependent-lib=libcmt
+! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib
+! MSVC-SAME: --dependent-lib=FortranDecimal.static.lib
+
+! MSVC-DEBUG: -fc1
+! MSVC-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib
+! MSVC-DEBUG-SAME: -D_MT
+! MSVC-DEBUG-SAME: -D_DEBUG
+! MSVC-DEBUG-SAME: --dependent-lib=libcmtd
+! MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib
+! MSVC-DEBUG-SAME: --dependent-lib=FortranDecimal.static_dbg.lib
+
+! MSVC-DLL: -fc1
+! MSVC-DLL-SAME: --dependent-lib=clang_rt.builtins.lib
+! MSVC-DLL-SAME: -D_MT
+! MSVC-DLL-SAME: -D_DLL
+! MSVC-DLL-SAME: --dependent-lib=msvcrt
+! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib
+! MSVC-DLL-SAME: --dependent-lib=FortranDecimal.dynamic.lib
+
+! MSVC-DLL-DEBUG: -fc1
+! MSVC-DLL-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib
+! MSVC-DLL-DEBUG-SAME: -D_MT
+! MSVC-DLL-DEBUG-SAME: -D_DEBUG
+! MSVC-DLL-DEBUG-SAME: -D_DLL
+! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd
+! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib
+! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranDecimal.dynamic_dbg.lib
diff --git a/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile b/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile
index d420a34c03e7..a1f689e07c77 100644
--- a/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile
+++ b/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile
@@ -1,4 +1,4 @@
-
-CXX_SOURCES := main.cpp
-
-include Makefile.rules
+
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms b/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms
index e817a491af57..cab06c1c9d50 100644
--- a/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms
+++ b/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms
@@ -1,2 +1,2 @@
-MODULE windows x86 0F45B7919A9646F9BF8F2D6076EA421A11 fizzbuzz.pdb
-PUBLIC 1000 0 main
+MODULE windows x86 0F45B7919A9646F9BF8F2D6076EA421A11 fizzbuzz.pdb
+PUBLIC 1000 0 main
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
index 745f6cc9d65a..e3b48697fd78 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile
@@ -1,23 +1,23 @@
-CXX_SOURCES := main.cpp
-LD_EXTRAS := -L. -l_d -l_c -l_a -l_b
-
-a.out: lib_b lib_a lib_c lib_d
-
-include Makefile.rules
-
-lib_a: lib_b
-	"$(MAKE)" -f $(MAKEFILE_RULES) \
-		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \
-		LD_EXTRAS="-L. -l_b"
-
-lib_b:
-	"$(MAKE)" -f $(MAKEFILE_RULES) \
-		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b
-
-lib_c:
-	"$(MAKE)" -f $(MAKEFILE_RULES) \
-		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c
-
-lib_d:
-	"$(MAKE)" -f $(MAKEFILE_RULES) \
-		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d
+CXX_SOURCES := main.cpp
+LD_EXTRAS := -L. -l_d -l_c -l_a -l_b
+
+a.out: lib_b lib_a lib_c lib_d
+
+include Makefile.rules
+
+lib_a: lib_b
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \
+		LD_EXTRAS="-L. -l_b"
+
+lib_b:
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b
+
+lib_c:
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c
+
+lib_d:
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp
index 66633b70ee1e..778b46ed5cef 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp
@@ -1,3 +1,3 @@
-extern "C" int b_function();
-
-extern "C" int a_function() { return b_function(); }
+extern "C" int b_function();
+
+extern "C" int a_function() { return b_function(); }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp
index 8b16fbdb5728..4f1a4032ee0e 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp
@@ -1 +1 @@
-extern "C" int b_function() { return 500; }
+extern "C" int b_function() { return 500; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp
index 120c88f2bb60..8abd1b155a75 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp
@@ -1 +1 @@
-extern "C" int c_function() { return 600; }
+extern "C" int c_function() { return 600; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp
index d37ad2621ae4..58888a29ba32 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp
@@ -1 +1 @@
-extern "C" int d_function() { return 700; }
+extern "C" int d_function() { return 700; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
index bd2c79cdab9d..77b38c5ccdc6 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
@@ -1,16 +1,16 @@
-#include <stdio.h>
-
-extern "C" int a_function();
-extern "C" int c_function();
-extern "C" int b_function();
-extern "C" int d_function();
-
-int main() {
-  a_function();
-  b_function();
-  c_function();
-  d_function();
-
-  puts("running"); // breakpoint here
-  return 0;
-}
+#include <stdio.h>
+
+extern "C" int a_function();
+extern "C" int c_function();
+extern "C" int b_function();
+extern "C" int d_function();
+
+int main() {
+  a_function();
+  b_function();
+  c_function();
+  d_function();
+
+  puts("running"); // breakpoint here
+  return 0;
+}
(Note: the header name in the #include above was eaten by extraction; <stdio.h> is an inference from the puts() call, not verbatim from the patch.)
diff --git a/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile b/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile
index 10495940055b..15a931850e17 100644
--- a/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile
+++ b/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile
@@ -1,3 +1,3 @@
-C_SOURCES := main.c
-
-include Makefile.rules
+C_SOURCES := main.c
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py
index 70f72c72c834..d660844405e1 100644
--- a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py
+++ b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py
@@ -1,88 +1,88 @@
-"""
-Test that line information is recalculated properly for a frame when it moves
-from the middle of the backtrace to a zero index.
-
-This is a regression test for a StackFrame bug, where whether frame is zero or
-not depends on an internal field. When LLDB was updating its frame list value
-of the field wasn't copied into existing StackFrame instances, so those
-StackFrame instances, would use an incorrect line entry evaluation logic in
-situations if it was in the middle of the stack frame list (not zeroth), and
-then moved to the top position. The difference in logic is that for zeroth
-frames line entry is returned for program counter, while for other frame
-(except for those that "behave like zeroth") it is for the instruction
-preceding PC, as PC points to the next instruction after function call. When
-the bug is present, when execution stops at the second breakpoint
-SBFrame.GetLineEntry() returns line entry for the previous line, rather than
-the one with a breakpoint. Note that this is specific to
-SBFrame.GetLineEntry(), SBFrame.GetPCAddress().GetLineEntry() would return
-correct entry.
-
-This bug doesn't reproduce through an LLDB interpretator, however it happens
-when using API directly, for example in LLDB-MI.
-"""
-
-import lldb
-from lldbsuite.test.decorators import *
-from lldbsuite.test.lldbtest import *
-from lldbsuite.test import lldbutil
-
-
-class ZerothFrame(TestBase):
-    def test(self):
-        """
-        Test that line information is recalculated properly for a frame when it moves
-        from the middle of the backtrace to a zero index.
-        """
-        self.build()
-        self.setTearDownCleanup()
-
-        exe = self.getBuildArtifact("a.out")
-        target = self.dbg.CreateTarget(exe)
-        self.assertTrue(target, VALID_TARGET)
-
-        main_dot_c = lldb.SBFileSpec("main.c")
-        bp1 = target.BreakpointCreateBySourceRegex(
-            "// Set breakpoint 1 here", main_dot_c
-        )
-        bp2 = target.BreakpointCreateBySourceRegex(
-            "// Set breakpoint 2 here", main_dot_c
-        )
-
-        process = target.LaunchSimple(None, None, self.get_process_working_directory())
-        self.assertTrue(process, VALID_PROCESS)
-
-        thread = self.thread()
-
-        if self.TraceOn():
-            print("Backtrace at the first breakpoint:")
-            for f in thread.frames:
-                print(f)
-
-        # Check that we have stopped at correct breakpoint.
-        self.assertEqual(
-            thread.frame[0].GetLineEntry().GetLine(),
-            bp1.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(),
-            "LLDB reported incorrect line number.",
-        )
-
-        # Important to use SBProcess::Continue() instead of
-        # self.runCmd('continue'), because the problem doesn't reproduce with
-        # 'continue' command.
-        process.Continue()
-
-        if self.TraceOn():
-            print("Backtrace at the second breakpoint:")
-            for f in thread.frames:
-                print(f)
-        # Check that we have stopped at the breakpoint
-        self.assertEqual(
-            thread.frame[0].GetLineEntry().GetLine(),
-            bp2.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(),
-            "LLDB reported incorrect line number.",
-        )
-        # Double-check with GetPCAddress()
-        self.assertEqual(
-            thread.frame[0].GetLineEntry().GetLine(),
-            thread.frame[0].GetPCAddress().GetLineEntry().GetLine(),
-            "LLDB reported incorrect line number.",
-        )
+"""
+Test that line information is recalculated properly for a frame when it moves
+from the middle of the backtrace to a zero index.
+
+This is a regression test for a StackFrame bug, where whether frame is zero or
+not depends on an internal field. When LLDB was updating its frame list value
+of the field wasn't copied into existing StackFrame instances, so those
+StackFrame instances, would use an incorrect line entry evaluation logic in
+situations if it was in the middle of the stack frame list (not zeroth), and
+then moved to the top position. The difference in logic is that for zeroth
+frames line entry is returned for program counter, while for other frame
+(except for those that "behave like zeroth") it is for the instruction
+preceding PC, as PC points to the next instruction after function call. When
+the bug is present, when execution stops at the second breakpoint
+SBFrame.GetLineEntry() returns line entry for the previous line, rather than
+the one with a breakpoint. Note that this is specific to
+SBFrame.GetLineEntry(), SBFrame.GetPCAddress().GetLineEntry() would return
+correct entry.
+
+This bug doesn't reproduce through an LLDB interpretator, however it happens
+when using API directly, for example in LLDB-MI.
+"""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class ZerothFrame(TestBase):
+    def test(self):
+        """
+        Test that line information is recalculated properly for a frame when it moves
+        from the middle of the backtrace to a zero index.
+        """
+        self.build()
+        self.setTearDownCleanup()
+
+        exe = self.getBuildArtifact("a.out")
+        target = self.dbg.CreateTarget(exe)
+        self.assertTrue(target, VALID_TARGET)
+
+        main_dot_c = lldb.SBFileSpec("main.c")
+        bp1 = target.BreakpointCreateBySourceRegex(
+            "// Set breakpoint 1 here", main_dot_c
+        )
+        bp2 = target.BreakpointCreateBySourceRegex(
+            "// Set breakpoint 2 here", main_dot_c
+        )
+
+        process = target.LaunchSimple(None, None, self.get_process_working_directory())
+        self.assertTrue(process, VALID_PROCESS)
+
+        thread = self.thread()
+
+        if self.TraceOn():
+            print("Backtrace at the first breakpoint:")
+            for f in thread.frames:
+                print(f)
+
+        # Check that we have stopped at correct breakpoint.
+        self.assertEqual(
+            thread.frame[0].GetLineEntry().GetLine(),
+            bp1.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(),
+            "LLDB reported incorrect line number.",
+        )
+
+        # Important to use SBProcess::Continue() instead of
+        # self.runCmd('continue'), because the problem doesn't reproduce with
+        # 'continue' command.
+        process.Continue()
+
+        if self.TraceOn():
+            print("Backtrace at the second breakpoint:")
+            for f in thread.frames:
+                print(f)
+        # Check that we have stopped at the breakpoint
+        self.assertEqual(
+            thread.frame[0].GetLineEntry().GetLine(),
+            bp2.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(),
+            "LLDB reported incorrect line number.",
+        )
+        # Double-check with GetPCAddress()
+        self.assertEqual(
+            thread.frame[0].GetLineEntry().GetLine(),
+            thread.frame[0].GetPCAddress().GetLineEntry().GetLine(),
+            "LLDB reported incorrect line number.",
+        )
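The semantics that test pins down are easiest to see side by side. A
hypothetical SB API fragment (not part of the patch; frame is an SBFrame
obtained after a stop):

    # Zeroth frame: GetLineEntry() describes the PC itself.
    # Mid-stack frame: it describes the call site, i.e. the instruction
    # before the saved PC, since the saved PC points after the call.
    line_from_frame = frame.GetLineEntry().GetLine()
    line_from_pc = frame.GetPCAddress().GetLineEntry().GetLine()
    # With the bug, a frame that moved from mid-stack to index 0 kept the
    # call-site logic, so the two values could disagree at a breakpoint:
    assert line_from_frame == line_from_pc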
diff --git a/lldb/test/API/python_api/debugger/Makefile b/lldb/test/API/python_api/debugger/Makefile
index 99998b20bcb0..bfad5f33e867 100644
--- a/lldb/test/API/python_api/debugger/Makefile
+++ b/lldb/test/API/python_api/debugger/Makefile
@@ -1,3 +1,3 @@
-CXX_SOURCES := main.cpp
-
-include Makefile.rules
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/Shell/BuildScript/modes.test b/lldb/test/Shell/BuildScript/modes.test
index 1ce50104855f..02311f712d77 100644
--- a/lldb/test/Shell/BuildScript/modes.test
+++ b/lldb/test/Shell/BuildScript/modes.test
@@ -1,35 +1,35 @@
-RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \
-RUN: | FileCheck --check-prefix=COMPILE %s
-
-RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \
-RUN: | FileCheck --check-prefix=COMPILE-MULTI %s
-
-RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foo.exe foobar.obj \
-RUN: | FileCheck --check-prefix=LINK %s
-
-RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foobar.exe foo.obj bar.obj \
-RUN: | FileCheck --check-prefix=LINK-MULTI %s
-
-RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foobar.c \
-RUN: | FileCheck --check-prefix=BOTH %s
-
-RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foo.c bar.c \
-RUN: | FileCheck --check-prefix=BOTH-MULTI %s
-
-
-COMPILE: compiling foobar.c -> foo.out
-
-COMPILE-MULTI: compiling foo.c -> foo.o{{(bj)?}}
-COMPILE-MULTI: compiling bar.c -> bar.o{{(bj)?}}
-
-
-LINK: linking foobar.obj -> foo.exe
-
-LINK-MULTI: linking foo.obj+bar.obj -> foobar.exe
-
-BOTH: compiling foobar.c -> [[OBJFOO:foobar.exe-foobar.o(bj)?]]
-BOTH: linking [[OBJFOO]] -> foobar.exe
-
-BOTH-MULTI: compiling foo.c -> [[OBJFOO:foobar.exe-foo.o(bj)?]]
-BOTH-MULTI: compiling bar.c -> [[OBJBAR:foobar.exe-bar.o(bj)?]]
-BOTH-MULTI: linking [[OBJFOO]]+[[OBJBAR]] -> foobar.exe
+RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \
+RUN: | FileCheck --check-prefix=COMPILE %s
+
+RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \
+RUN: | FileCheck --check-prefix=COMPILE-MULTI %s
+
+RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foo.exe foobar.obj \
+RUN: | FileCheck --check-prefix=LINK %s
+
+RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foobar.exe foo.obj bar.obj \
+RUN: | FileCheck --check-prefix=LINK-MULTI %s
+
+RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foobar.c \
+RUN: | FileCheck --check-prefix=BOTH %s
+
+RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foo.c bar.c \
+RUN: | FileCheck --check-prefix=BOTH-MULTI %s
+
+
+COMPILE: compiling foobar.c -> foo.out
+
+COMPILE-MULTI: compiling foo.c -> foo.o{{(bj)?}}
+COMPILE-MULTI: compiling bar.c -> bar.o{{(bj)?}}
+
+
+LINK: linking foobar.obj -> foo.exe
+
+LINK-MULTI: linking foo.obj+bar.obj -> foobar.exe
+
+BOTH: compiling foobar.c -> [[OBJFOO:foobar.exe-foobar.o(bj)?]]
+BOTH: linking [[OBJFOO]] -> foobar.exe
+
+BOTH-MULTI: compiling foo.c -> [[OBJFOO:foobar.exe-foo.o(bj)?]]
+BOTH-MULTI: compiling bar.c -> [[OBJBAR:foobar.exe-bar.o(bj)?]]
+BOTH-MULTI: linking [[OBJFOO]]+[[OBJBAR]] -> foobar.exe
diff --git a/lldb/test/Shell/BuildScript/script-args.test b/lldb/test/Shell/BuildScript/script-args.test
index 647a48e4442b..13e8a5160942 100644
--- a/lldb/test/Shell/BuildScript/script-args.test
+++ b/lldb/test/Shell/BuildScript/script-args.test
@@ -1,32 +1,32 @@
-RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \
-RUN: | FileCheck %s
-RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \
-RUN: | FileCheck --check-prefix=MULTI-INPUT %s
-
-
-CHECK: Script Arguments:
-CHECK-NEXT: Arch: 32
-CHECK: Compiler: any
-CHECK: Outdir: {{.*}}script-args.test.tmp
-CHECK: Output: {{.*}}script-args.test.tmp{{.}}foo.out
-CHECK: Nodefaultlib: False
-CHECK: Opt: none
-CHECK: Mode: compile
-CHECK: Clean: True
-CHECK: Verbose: True
-CHECK: Dryrun: True
-CHECK: Inputs: foobar.c
-
-MULTI-INPUT: Script Arguments:
-MULTI-INPUT-NEXT: Arch: 32
-MULTI-INPUT-NEXT: Compiler: any
-MULTI-INPUT-NEXT: Outdir: {{.*}}script-args.test.tmp
-MULTI-INPUT-NEXT: Output:
-MULTI-INPUT-NEXT: Nodefaultlib: False
-MULTI-INPUT-NEXT: Opt: none
-MULTI-INPUT-NEXT: Mode: compile
-MULTI-INPUT-NEXT: Clean: True
-MULTI-INPUT-NEXT: Verbose: True
-MULTI-INPUT-NEXT: Dryrun: True
-MULTI-INPUT-NEXT: Inputs: foo.c
-MULTI-INPUT-NEXT: bar.c
+RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \
+RUN: | FileCheck %s
+RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \
+RUN: | FileCheck --check-prefix=MULTI-INPUT %s
+
+
+CHECK: Script Arguments:
+CHECK-NEXT: Arch: 32
+CHECK: Compiler: any
+CHECK: Outdir: {{.*}}script-args.test.tmp
+CHECK: Output: {{.*}}script-args.test.tmp{{.}}foo.out
+CHECK: Nodefaultlib: False
+CHECK: Opt: none
+CHECK: Mode: compile
+CHECK: Clean: True
+CHECK: Verbose: True
+CHECK: Dryrun: True
+CHECK: Inputs: foobar.c
+
+MULTI-INPUT: Script Arguments:
+MULTI-INPUT-NEXT: Arch: 32
+MULTI-INPUT-NEXT: Compiler: any
+MULTI-INPUT-NEXT: Outdir: {{.*}}script-args.test.tmp
+MULTI-INPUT-NEXT: Output:
+MULTI-INPUT-NEXT: Nodefaultlib: False
+MULTI-INPUT-NEXT: Opt: none
+MULTI-INPUT-NEXT: Mode: compile
+MULTI-INPUT-NEXT: Clean: True
+MULTI-INPUT-NEXT: Verbose: True
+MULTI-INPUT-NEXT: Dryrun: True
+MULTI-INPUT-NEXT: Inputs: foo.c
+MULTI-INPUT-NEXT: bar.c
diff --git a/lldb/test/Shell/BuildScript/toolchain-clang-cl.test b/lldb/test/Shell/BuildScript/toolchain-clang-cl.test
index 4f64859a02b6..8c9ea9fddb8a 100644
--- a/lldb/test/Shell/BuildScript/toolchain-clang-cl.test
+++ b/lldb/test/Shell/BuildScript/toolchain-clang-cl.test
@@ -1,49 +1,49 @@
-REQUIRES: lld, system-windows
-
-RUN: %build -n --verbose --arch=32 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \
-RUN: | FileCheck --check-prefix=CHECK-32 %s
-
-RUN: %build -n --verbose --arch=64 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \
-RUN: | FileCheck --check-prefix=CHECK-64 %s
-
-CHECK-32: Script Arguments:
-CHECK-32: Arch: 32
-CHECK-32: Compiler: clang-cl
-CHECK-32: Outdir: {{.*}}
-CHECK-32: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe
-CHECK-32: Nodefaultlib: False
-CHECK-32: Opt: none
-CHECK-32: Mode: compile
-CHECK-32: Clean: True
-CHECK-32: Verbose: True
-CHECK-32: Dryrun: True
-CHECK-32: Inputs: foobar.c
-CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk
-CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj
-CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb
-CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe
-CHECK-32: compiling foobar.c -> foo.exe-foobar.obj
-CHECK-32: {{.*}}clang-cl{{(\.EXE)?}} -m32
-CHECK-32: linking foo.exe-foobar.obj -> foo.exe
-CHECK-32: {{.*}}lld-link{{(\.EXE)?}}
-
-CHECK-64: Script Arguments:
-CHECK-64: Arch: 64
-CHECK-64: Compiler: clang-cl
-CHECK-64: Outdir: {{.*}}
-CHECK-64: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe
-CHECK-64: Nodefaultlib: False
-CHECK-64: Opt: none
-CHECK-64: Mode: compile
-CHECK-64: Clean: True
-CHECK-64: Verbose: True
-CHECK-64: Dryrun: True
-CHECK-64: Inputs: foobar.c
-CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk
-CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj
-CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb
-CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe
-CHECK-64: compiling foobar.c -> foo.exe-foobar.obj
-CHECK-64: {{.*}}clang-cl{{(\.EXE)?}} -m64
-CHECK-64: linking foo.exe-foobar.obj -> foo.exe
-CHECK-64: {{.*}}lld-link{{(\.EXE)?}}
+REQUIRES: lld, system-windows
+
+RUN: %build -n --verbose --arch=32 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \
+RUN: | FileCheck --check-prefix=CHECK-32 %s
+
+RUN: %build -n --verbose --arch=64 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \
+RUN: | FileCheck --check-prefix=CHECK-64 %s
+
+CHECK-32: Script Arguments:
+CHECK-32: Arch: 32
+CHECK-32: Compiler: clang-cl
+CHECK-32: Outdir: {{.*}}
+CHECK-32: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe
+CHECK-32: Nodefaultlib: False
+CHECK-32: Opt: none
+CHECK-32: Mode: compile
+CHECK-32: Clean: True
+CHECK-32: Verbose: True
+CHECK-32: Dryrun: True
+CHECK-32: Inputs: foobar.c
+CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk
+CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj
+CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb
+CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe
+CHECK-32: compiling foobar.c -> foo.exe-foobar.obj
+CHECK-32: {{.*}}clang-cl{{(\.EXE)?}} -m32
+CHECK-32: linking foo.exe-foobar.obj -> foo.exe
+CHECK-32: {{.*}}lld-link{{(\.EXE)?}}
+
+CHECK-64: Script Arguments:
+CHECK-64: Arch: 64
+CHECK-64: Compiler: clang-cl
+CHECK-64: Outdir: {{.*}}
+CHECK-64: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe
+CHECK-64: Nodefaultlib: False
+CHECK-64: Opt: none
+CHECK-64: Mode: compile
+CHECK-64: Clean: True
+CHECK-64: Verbose: True
+CHECK-64: Dryrun: True
+CHECK-64: Inputs: foobar.c
+CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk
+CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj
+CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb
+CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe
+CHECK-64: compiling foobar.c -> foo.exe-foobar.obj
+CHECK-64: {{.*}}clang-cl{{(\.EXE)?}} -m64
+CHECK-64: linking foo.exe-foobar.obj -> foo.exe
+CHECK-64: {{.*}}lld-link{{(\.EXE)?}}
a/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp
+++ b/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp
@@ -1,40 +1,40 @@
-
-// nodefaultlib build: cl -Zi sigsegv.cpp /link /nodefaultlib
-
-#ifdef USE_CRT
-#include <stdio.h>
-#else
-int main();
-extern "C"
-{
-    int _fltused;
-    void mainCRTStartup() { main(); }
-    void printf(const char*, ...) {}
-}
-#endif
-
-void crash(bool crash_self)
-{
-    printf("Before...\n");
-    if(crash_self)
-    {
-        printf("Crashing in 3, 2, 1 ...\n");
-        *(volatile int*)nullptr = 0;
-    }
-    printf("After...\n");
-}
-
-int foo(int x, float y, const char* msg)
-{
-    bool flag = x > y;
-    if(flag)
-        printf("x = %d, y = %f, msg = %s\n", x, y, msg);
-    crash(flag);
-    return x << 1;
-}
-
-int main()
-{
-    foo(10, 3.14, "testing");
-}
-
+
+// nodefaultlib build: cl -Zi sigsegv.cpp /link /nodefaultlib
+
+#ifdef USE_CRT
+#include <stdio.h>
+#else
+int main();
+extern "C"
+{
+    int _fltused;
+    void mainCRTStartup() { main(); }
+    void printf(const char*, ...) {}
+}
+#endif
+
+void crash(bool crash_self)
+{
+    printf("Before...\n");
+    if(crash_self)
+    {
+        printf("Crashing in 3, 2, 1 ...\n");
+        *(volatile int*)nullptr = 0;
+    }
+    printf("After...\n");
+}
+
+int foo(int x, float y, const char* msg)
+{
+    bool flag = x > y;
+    if(flag)
+        printf("x = %d, y = %f, msg = %s\n", x, y, msg);
+    crash(flag);
+    return x << 1;
+}
+
+int main()
+{
+    foo(10, 3.14, "testing");
+}
+
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s
index a9d248758bfc..aac8f4c16980 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s
+++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s
@@ -1,622 +1,622 @@
-# Compiled from the following files, but replaced the call to abort with nop.
-# clang-cl -fuse-ld=lld-link /Z7 /O1 /Faa.asm /winsysroot~/win_toolchain a.cpp -# a.cpp: -# #include "a.h" -# int main(int argc, char** argv) { -# volatile int main_local = Namespace1::foo(2); -# return 0; -# } -# a.h: -# #include -# #include "b.h" -# namespace Namespace1 { -# inline int foo(int x) { -# volatile int foo_local = x + 1; -# ++foo_local; -# if (!foo_local) -# abort(); -# return Class1::bar(foo_local); -# } -# } // namespace Namespace1 -# b.h: -# #include "c.h" -# class Class1 { -# public: -# inline static int bar(int x) { -# volatile int bar_local = x + 1; -# ++bar_local; -# return Namespace2::Class2::func(bar_local); -# } -# }; -# c.h: -# namespace Namespace2 { -# class Class2 { -# public: -# inline static int func(int x) { -# volatile int func_local = x + 1; -# func_local += x; -# return func_local; -# } -# }; -# } // namespace Namespace2 - - .text - .def @feat.00; - .scl 3; - .type 0; - .endef - .globl @feat.00 -.set @feat.00, 0 - .intel_syntax noprefix - .file "a.cpp" - .def main; - .scl 2; - .type 32; - .endef - .section .text,"xr",one_only,main - .globl main # -- Begin function main -main: # @main -.Lfunc_begin0: - .cv_func_id 0 - .cv_file 1 "/tmp/a.cpp" "4FFB96E5DF1A95CE7DB9732CFFE001D7" 1 - .cv_loc 0 1 2 0 # a.cpp:2:0 -.seh_proc main -# %bb.0: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - sub rsp, 56 - .seh_stackalloc 56 - .seh_endprologue -.Ltmp0: - .cv_file 2 "/tmp/./a.h" "BBFED90EF093E9C1D032CC9B05B5D167" 1 - .cv_inline_site_id 1 within 0 inlined_at 1 3 0 - .cv_loc 1 2 5 0 # ./a.h:5:0 - mov dword ptr [rsp + 44], 3 - .cv_loc 1 2 6 0 # ./a.h:6:0 - inc dword ptr [rsp + 44] - .cv_loc 1 2 7 0 # ./a.h:7:0 - mov eax, dword ptr [rsp + 44] - test eax, eax - je .LBB0_2 -.Ltmp1: -# %bb.1: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - .cv_loc 1 2 9 0 # ./a.h:9:0 - mov eax, dword ptr [rsp + 44] -.Ltmp2: - #DEBUG_VALUE: bar:x <- $eax - .cv_file 3 "/tmp/./b.h" "A26CC743A260115F33AF91AB11F95877" 1 - .cv_inline_site_id 2 within 1 inlined_at 2 9 0 - .cv_loc 2 3 5 0 # ./b.h:5:0 - inc eax -.Ltmp3: - mov dword ptr [rsp + 52], eax - .cv_loc 2 3 6 0 # ./b.h:6:0 - inc dword ptr [rsp + 52] - .cv_loc 2 3 7 0 # ./b.h:7:0 - mov eax, dword ptr [rsp + 52] -.Ltmp4: - #DEBUG_VALUE: func:x <- $eax - .cv_file 4 "/tmp/./c.h" "8AF4613F78624BBE96D1C408ABA39B2D" 1 - .cv_inline_site_id 3 within 2 inlined_at 3 7 0 - .cv_loc 3 4 5 0 # ./c.h:5:0 - lea ecx, [rax + 1] -.Ltmp5: - #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx - mov dword ptr [rsp + 48], ecx - .cv_loc 3 4 6 0 # ./c.h:6:0 - add dword ptr [rsp + 48], eax - .cv_loc 3 4 7 0 # ./c.h:7:0 - mov eax, dword ptr [rsp + 48] -.Ltmp6: - .cv_loc 0 1 3 0 # a.cpp:3:0 - mov dword ptr [rsp + 48], eax - .cv_loc 0 1 4 0 # a.cpp:4:0 - xor eax, eax - # Use fake debug info to tests inline info. 
- .cv_loc 1 2 20 0 - add rsp, 56 - ret -.Ltmp7: -.LBB0_2: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - .cv_loc 1 2 8 0 # ./a.h:8:0 - nop -.Ltmp8: - int3 -.Ltmp9: - #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx - #DEBUG_VALUE: main:argv <- [DW_OP_LLVM_entry_value 1] $rdx -.Lfunc_end0: - .seh_endproc - # -- End function - .section .drectve,"yn" - .ascii " /DEFAULTLIB:libcmt.lib" - .ascii " /DEFAULTLIB:oldnames.lib" - .section .debug$S,"dr" - .p2align 2 - .long 4 # Debug section magic - .long 241 - .long .Ltmp11-.Ltmp10 # Subsection size -.Ltmp10: - .short .Ltmp13-.Ltmp12 # Record length -.Ltmp12: - .short 4353 # Record kind: S_OBJNAME - .long 0 # Signature - .asciz "/tmp/a-2b2ba0.obj" # Object name - .p2align 2 -.Ltmp13: - .short .Ltmp15-.Ltmp14 # Record length -.Ltmp14: - .short 4412 # Record kind: S_COMPILE3 - .long 1 # Flags and language - .short 208 # CPUType - .short 15 # Frontend version - .short 0 - .short 0 - .short 0 - .short 15000 # Backend version - .short 0 - .short 0 - .short 0 - .asciz "clang version 15.0.0" # Null-terminated compiler version string - .p2align 2 -.Ltmp15: -.Ltmp11: - .p2align 2 - .long 246 # Inlinee lines subsection - .long .Ltmp17-.Ltmp16 # Subsection size -.Ltmp16: - .long 0 # Inlinee lines signature - - # Inlined function foo starts at ./a.h:4 - .long 4099 # Type index of inlined function - .cv_filechecksumoffset 2 # Offset into filechecksum table - .long 4 # Starting line number - - # Inlined function bar starts at ./b.h:4 - .long 4106 # Type index of inlined function - .cv_filechecksumoffset 3 # Offset into filechecksum table - .long 4 # Starting line number - - # Inlined function func starts at ./c.h:4 - .long 4113 # Type index of inlined function - .cv_filechecksumoffset 4 # Offset into filechecksum table - .long 4 # Starting line number -.Ltmp17: - .p2align 2 - .section .debug$S,"dr",associative,main - .p2align 2 - .long 4 # Debug section magic - .long 241 # Symbol subsection for main - .long .Ltmp19-.Ltmp18 # Subsection size -.Ltmp18: - .short .Ltmp21-.Ltmp20 # Record length -.Ltmp20: - .short 4423 # Record kind: S_GPROC32_ID - .long 0 # PtrParent - .long 0 # PtrEnd - .long 0 # PtrNext - .long .Lfunc_end0-main # Code size - .long 0 # Offset after prologue - .long 0 # Offset before epilogue - .long 4117 # Function type index - .secrel32 main # Function section relative address - .secidx main # Function section index - .byte 0 # Flags - .asciz "main" # Function name - .p2align 2 -.Ltmp21: - .short .Ltmp23-.Ltmp22 # Record length -.Ltmp22: - .short 4114 # Record kind: S_FRAMEPROC - .long 56 # FrameSize - .long 0 # Padding - .long 0 # Offset of padding - .long 0 # Bytes of callee saved registers - .long 0 # Exception handler offset - .short 0 # Exception handler section - .long 81920 # Flags (defines frame register) - .p2align 2 -.Ltmp23: - .short .Ltmp25-.Ltmp24 # Record length -.Ltmp24: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "argc" - .p2align 2 -.Ltmp25: - .cv_def_range .Lfunc_begin0 .Ltmp5 .Ltmp7 .Ltmp8, reg, 18 - .short .Ltmp27-.Ltmp26 # Record length -.Ltmp26: - .short 4414 # Record kind: S_LOCAL - .long 4114 # TypeIndex - .short 1 # Flags - .asciz "argv" - .p2align 2 -.Ltmp27: - .cv_def_range .Lfunc_begin0 .Ltmp8, reg, 331 - .short .Ltmp29-.Ltmp28 # Record length -.Ltmp28: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "main_local" - .p2align 2 -.Ltmp29: - .cv_def_range .Ltmp0 .Ltmp9, 
frame_ptr_rel, 48 - .short .Ltmp31-.Ltmp30 # Record length -.Ltmp30: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4099 # Inlinee type index - .cv_inline_linetable 1 2 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp31: - .short .Ltmp33-.Ltmp32 # Record length -.Ltmp32: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 257 # Flags - .asciz "x" - .p2align 2 -.Ltmp33: - .short .Ltmp35-.Ltmp34 # Record length -.Ltmp34: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "foo_local" - .p2align 2 -.Ltmp35: - .cv_def_range .Ltmp0 .Ltmp6 .Ltmp7 .Ltmp9, frame_ptr_rel, 44 - .short .Ltmp37-.Ltmp36 # Record length -.Ltmp36: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4106 # Inlinee type index - .cv_inline_linetable 2 3 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp37: - .short .Ltmp39-.Ltmp38 # Record length -.Ltmp38: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "x" - .p2align 2 -.Ltmp39: - .cv_def_range .Ltmp2 .Ltmp3, reg, 17 - .short .Ltmp41-.Ltmp40 # Record length -.Ltmp40: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "bar_local" - .p2align 2 -.Ltmp41: - .cv_def_range .Ltmp2 .Ltmp6, frame_ptr_rel, 52 - .short .Ltmp43-.Ltmp42 # Record length -.Ltmp42: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4113 # Inlinee type index - .cv_inline_linetable 3 4 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp43: - .short .Ltmp45-.Ltmp44 # Record length -.Ltmp44: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "x" - .p2align 2 -.Ltmp45: - .cv_def_range .Ltmp4 .Ltmp6, reg, 17 - .short .Ltmp47-.Ltmp46 # Record length -.Ltmp46: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "func_local" - .p2align 2 -.Ltmp47: - .cv_def_range .Ltmp4 .Ltmp6, frame_ptr_rel, 48 - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4431 # Record kind: S_PROC_ID_END -.Ltmp19: - .p2align 2 - .cv_linetable 0, main, .Lfunc_end0 - .section .debug$S,"dr" - .long 241 - .long .Ltmp49-.Ltmp48 # Subsection size -.Ltmp48: - .short .Ltmp51-.Ltmp50 # Record length -.Ltmp50: - .short 4360 # Record kind: S_UDT - .long 4103 # Type - .asciz "Class1" - .p2align 2 -.Ltmp51: - .short .Ltmp53-.Ltmp52 # Record length -.Ltmp52: - .short 4360 # Record kind: S_UDT - .long 4110 # Type - .asciz "Namespace2::Class2" - .p2align 2 -.Ltmp53: -.Ltmp49: - .p2align 2 - .cv_filechecksums # File index to string table offset subsection - .cv_stringtable # String table - .long 241 - .long .Ltmp55-.Ltmp54 # Subsection size -.Ltmp54: - .short .Ltmp57-.Ltmp56 # Record length -.Ltmp56: - .short 4428 # Record kind: S_BUILDINFO - .long 4124 # LF_BUILDINFO index - .p2align 2 -.Ltmp57: -.Ltmp55: - .p2align 2 - .section .debug$T,"dr" - .p2align 2 - .long 4 # Debug section magic - # StringId (0x1000) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "Namespace1" # StringData - .byte 241 - # ArgList (0x1001) - .short 0xa # Record length - .short 0x1201 # Record kind: LF_ARGLIST - .long 0x1 # NumArgs - .long 0x74 # Argument: int - # Procedure (0x1002) - .short 0xe # Record length 
- .short 0x1008 # Record kind: LF_PROCEDURE - .long 0x74 # ReturnType: int - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - # FuncId (0x1003) - .short 0xe # Record length - .short 0x1601 # Record kind: LF_FUNC_ID - .long 0x1000 # ParentScope: Namespace1 - .long 0x1002 # FunctionType: int (int) - .asciz "foo" # Name - # Class (0x1004) - .short 0x2a # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x0 # MemberCount - .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) - .long 0x0 # FieldList - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x0 # SizeOf - .asciz "Class1" # Name - .asciz ".?AVClass1@@" # LinkageName - .byte 242 - .byte 241 - # MemberFunction (0x1005) - .short 0x1a # Record length - .short 0x1009 # Record kind: LF_MFUNCTION - .long 0x74 # ReturnType: int - .long 0x1004 # ClassType: Class1 - .long 0x0 # ThisType - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - .long 0x0 # ThisAdjustment - # FieldList (0x1006) - .short 0xe # Record length - .short 0x1203 # Record kind: LF_FIELDLIST - .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) - .short 0xb # Attrs: Public, Static - .long 0x1005 # Type: int Class1::(int) - .asciz "bar" # Name - # Class (0x1007) - .short 0x2a # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x1 # MemberCount - .short 0x200 # Properties ( HasUniqueName (0x200) ) - .long 0x1006 # FieldList: - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x1 # SizeOf - .asciz "Class1" # Name - .asciz ".?AVClass1@@" # LinkageName - .byte 242 - .byte 241 - # StringId (0x1008) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp/./b.h" # StringData - .byte 241 - # UdtSourceLine (0x1009) - .short 0xe # Record length - .short 0x1606 # Record kind: LF_UDT_SRC_LINE - .long 0x1007 # UDT: Class1 - .long 0x1008 # SourceFile: /tmp/./b.h - .long 0x2 # LineNumber - # MemberFuncId (0x100A) - .short 0xe # Record length - .short 0x1602 # Record kind: LF_MFUNC_ID - .long 0x1004 # ClassType: Class1 - .long 0x1005 # FunctionType: int Class1::(int) - .asciz "bar" # Name - # Class (0x100B) - .short 0x42 # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x0 # MemberCount - .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) - .long 0x0 # FieldList - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x0 # SizeOf - .asciz "Namespace2::Class2" # Name - .asciz ".?AVClass2@Namespace2@@" # LinkageName - .byte 243 - .byte 242 - .byte 241 - # MemberFunction (0x100C) - .short 0x1a # Record length - .short 0x1009 # Record kind: LF_MFUNCTION - .long 0x74 # ReturnType: int - .long 0x100b # ClassType: Namespace2::Class2 - .long 0x0 # ThisType - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - .long 0x0 # ThisAdjustment - # FieldList (0x100D) - .short 0x12 # Record length - .short 0x1203 # Record kind: LF_FIELDLIST - .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) - .short 0xb # Attrs: Public, Static - .long 0x100c # Type: int Namespace2::Class2::(int) - .asciz "func" # Name - .byte 243 - .byte 242 - .byte 241 - # Class (0x100E) - .short 0x42 # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x1 # MemberCount - .short 0x200 # Properties ( HasUniqueName (0x200) ) 
- .long 0x100d # FieldList: - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x1 # SizeOf - .asciz "Namespace2::Class2" # Name - .asciz ".?AVClass2@Namespace2@@" # LinkageName - .byte 243 - .byte 242 - .byte 241 - # StringId (0x100F) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp/./c.h" # StringData - .byte 241 - # UdtSourceLine (0x1010) - .short 0xe # Record length - .short 0x1606 # Record kind: LF_UDT_SRC_LINE - .long 0x100e # UDT: Namespace2::Class2 - .long 0x100f # SourceFile: /tmp/./c.h - .long 0x2 # LineNumber - # MemberFuncId (0x1011) - .short 0x12 # Record length - .short 0x1602 # Record kind: LF_MFUNC_ID - .long 0x100b # ClassType: Namespace2::Class2 - .long 0x100c # FunctionType: int Namespace2::Class2::(int) - .asciz "func" # Name - .byte 243 - .byte 242 - .byte 241 - # Pointer (0x1012) - .short 0xa # Record length - .short 0x1002 # Record kind: LF_POINTER - .long 0x670 # PointeeType: char* - .long 0x1000c # Attrs: [ Type: Near64, Mode: Pointer, SizeOf: 8 ] - # ArgList (0x1013) - .short 0xe # Record length - .short 0x1201 # Record kind: LF_ARGLIST - .long 0x2 # NumArgs - .long 0x74 # Argument: int - .long 0x1012 # Argument: char** - # Procedure (0x1014) - .short 0xe # Record length - .short 0x1008 # Record kind: LF_PROCEDURE - .long 0x74 # ReturnType: int - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x2 # NumParameters - .long 0x1013 # ArgListType: (int, char**) - # FuncId (0x1015) - .short 0x12 # Record length - .short 0x1601 # Record kind: LF_FUNC_ID - .long 0x0 # ParentScope - .long 0x1014 # FunctionType: int (int, char**) - .asciz "main" # Name - .byte 243 - .byte 242 - .byte 241 - # Modifier (0x1016) - .short 0xa # Record length - .short 0x1001 # Record kind: LF_MODIFIER - .long 0x74 # ModifiedType: int - .short 0x2 # Modifiers ( Volatile (0x2) ) - .byte 242 - .byte 241 - # StringId (0x1017) - .short 0xe # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp" # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x1018) - .short 0xe # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "a.cpp" # StringData - .byte 242 - .byte 241 - # StringId (0x1019) - .short 0xa # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .byte 0 # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x101A) - .short 0x4e # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang" # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x101B) - .short 0x9f6 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "\"-cc1\" \"-triple\" \"x86_64-pc-windows-msvc19.20.0\" \"-S\" \"-disable-free\" \"-clear-ast-before-backend\" \"-disable-llvm-verifier\" \"-discard-value-names\" \"-mrelocation-model\" \"pic\" \"-pic-level\" \"2\" \"-mframe-pointer=none\" \"-relaxed-aliasing\" \"-fmath-errno\" \"-ffp-contract=on\" \"-fno-rounding-math\" \"-mconstructor-aliases\" \"-funwind-tables=2\" \"-target-cpu\" \"x86-64\" \"-mllvm\" \"-x86-asm-syntax=intel\" \"-tune-cpu\" \"generic\" \"-mllvm\" \"-treat-scalable-fixed-error-as-warning\" \"-D_MT\" \"-flto-visibility-public-std\" \"--dependent-lib=libcmt\" \"--dependent-lib=oldnames\" \"-stack-protector\" \"2\" \"-fms-volatile\" \"-fdiagnostics-format\" \"msvc\" \"-gno-column-info\" \"-gcodeview\" 
\"-debug-info-kind=constructor\" \"-ffunction-sections\" \"-fcoverage-compilation-dir=/tmp\" \"-resource-dir\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt\" \"-Os\" \"-fdeprecated-macro\" \"-fdebug-compilation-dir=/tmp\" \"-ferror-limit\" \"19\" \"-fno-use-cxa-atexit\" \"-fms-extensions\" \"-fms-compatibility\" \"-fms-compatibility-version=19.20\" \"-std=c++14\" \"-fdelayed-template-parsing\" \"-fcolor-diagnostics\" \"-vectorize-loops\" \"-vectorize-slp\" \"-faddrsig\" \"-x\" \"c++\"" # StringData - .byte 242 - .byte 241 - # BuildInfo (0x101C) - .short 0x1a # Record length - .short 0x1603 # Record kind: LF_BUILDINFO - .short 0x5 # NumArgs - .long 0x1017 # Argument: /tmp - .long 0x101a # Argument: /usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang - .long 0x1018 # Argument: a.cpp - .long 0x1019 # Argument - .long 0x101b # Argument: "-cc1" "-triple" "x86_64-pc-windows-msvc19.20.0" "-S" "-disable-free" "-clear-ast-before-backend" "-disable-llvm-verifier" "-discard-value-names" "-mrelocation-model" "pic" "-pic-level" "2" "-mframe-pointer=none" "-relaxed-aliasing" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-mllvm" "-x86-asm-syntax=intel" "-tune-cpu" "generic" "-mllvm" "-treat-scalable-fixed-error-as-warning" "-D_MT" "-flto-visibility-public-std" "--dependent-lib=libcmt" "--dependent-lib=oldnames" "-stack-protector" "2" "-fms-volatile" "-fdiagnostics-format" "msvc" "-gno-column-info" "-gcodeview" "-debug-info-kind=constructor" "-ffunction-sections" "-fcoverage-compilation-dir=/tmp" "-resource-dir" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0" "-internal-isystem" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include" "-internal-isystem" 
"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt" "-Os" "-fdeprecated-macro" "-fdebug-compilation-dir=/tmp" "-ferror-limit" "19" "-fno-use-cxa-atexit" "-fms-extensions" "-fms-compatibility" "-fms-compatibility-version=19.20" "-std=c++14" "-fdelayed-template-parsing" "-fcolor-diagnostics" "-vectorize-loops" "-vectorize-slp" "-faddrsig" "-x" "c++" - .byte 242 - .byte 241 - .addrsig +# Compiled from the following files, but replaced the call to abort with nop. +# clang-cl -fuse-ld=lld-link /Z7 /O1 /Faa.asm /winsysroot~/win_toolchain a.cpp +# a.cpp: +# #include "a.h" +# int main(int argc, char** argv) { +# volatile int main_local = Namespace1::foo(2); +# return 0; +# } +# a.h: +# #include +# #include "b.h" +# namespace Namespace1 { +# inline int foo(int x) { +# volatile int foo_local = x + 1; +# ++foo_local; +# if (!foo_local) +# abort(); +# return Class1::bar(foo_local); +# } +# } // namespace Namespace1 +# b.h: +# #include "c.h" +# class Class1 { +# public: +# inline static int bar(int x) { +# volatile int bar_local = x + 1; +# ++bar_local; +# return Namespace2::Class2::func(bar_local); +# } +# }; +# c.h: +# namespace Namespace2 { +# class Class2 { +# public: +# inline static int func(int x) { +# volatile int func_local = x + 1; +# func_local += x; +# return func_local; +# } +# }; +# } // namespace Namespace2 + + .text + .def @feat.00; + .scl 3; + .type 0; + .endef + .globl @feat.00 +.set @feat.00, 0 + .intel_syntax noprefix + .file "a.cpp" + .def main; + .scl 2; + .type 32; + .endef + .section .text,"xr",one_only,main + .globl main # -- Begin function main +main: # @main +.Lfunc_begin0: + .cv_func_id 0 + .cv_file 1 "/tmp/a.cpp" "4FFB96E5DF1A95CE7DB9732CFFE001D7" 1 + .cv_loc 0 1 2 0 # a.cpp:2:0 +.seh_proc main +# %bb.0: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + sub rsp, 56 + .seh_stackalloc 56 + .seh_endprologue +.Ltmp0: + .cv_file 2 "/tmp/./a.h" "BBFED90EF093E9C1D032CC9B05B5D167" 1 + .cv_inline_site_id 1 within 0 inlined_at 1 3 0 + .cv_loc 1 2 5 0 # ./a.h:5:0 + mov dword ptr [rsp + 44], 3 + .cv_loc 1 2 6 0 # ./a.h:6:0 + inc dword ptr [rsp + 44] + .cv_loc 1 2 7 0 # ./a.h:7:0 + mov eax, dword ptr [rsp + 44] + test eax, eax + je .LBB0_2 +.Ltmp1: +# %bb.1: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + .cv_loc 1 2 9 0 # ./a.h:9:0 + mov eax, dword ptr [rsp + 44] +.Ltmp2: + #DEBUG_VALUE: bar:x <- $eax + .cv_file 3 
"/tmp/./b.h" "A26CC743A260115F33AF91AB11F95877" 1 + .cv_inline_site_id 2 within 1 inlined_at 2 9 0 + .cv_loc 2 3 5 0 # ./b.h:5:0 + inc eax +.Ltmp3: + mov dword ptr [rsp + 52], eax + .cv_loc 2 3 6 0 # ./b.h:6:0 + inc dword ptr [rsp + 52] + .cv_loc 2 3 7 0 # ./b.h:7:0 + mov eax, dword ptr [rsp + 52] +.Ltmp4: + #DEBUG_VALUE: func:x <- $eax + .cv_file 4 "/tmp/./c.h" "8AF4613F78624BBE96D1C408ABA39B2D" 1 + .cv_inline_site_id 3 within 2 inlined_at 3 7 0 + .cv_loc 3 4 5 0 # ./c.h:5:0 + lea ecx, [rax + 1] +.Ltmp5: + #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx + mov dword ptr [rsp + 48], ecx + .cv_loc 3 4 6 0 # ./c.h:6:0 + add dword ptr [rsp + 48], eax + .cv_loc 3 4 7 0 # ./c.h:7:0 + mov eax, dword ptr [rsp + 48] +.Ltmp6: + .cv_loc 0 1 3 0 # a.cpp:3:0 + mov dword ptr [rsp + 48], eax + .cv_loc 0 1 4 0 # a.cpp:4:0 + xor eax, eax + # Use fake debug info to tests inline info. + .cv_loc 1 2 20 0 + add rsp, 56 + ret +.Ltmp7: +.LBB0_2: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + .cv_loc 1 2 8 0 # ./a.h:8:0 + nop +.Ltmp8: + int3 +.Ltmp9: + #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx + #DEBUG_VALUE: main:argv <- [DW_OP_LLVM_entry_value 1] $rdx +.Lfunc_end0: + .seh_endproc + # -- End function + .section .drectve,"yn" + .ascii " /DEFAULTLIB:libcmt.lib" + .ascii " /DEFAULTLIB:oldnames.lib" + .section .debug$S,"dr" + .p2align 2 + .long 4 # Debug section magic + .long 241 + .long .Ltmp11-.Ltmp10 # Subsection size +.Ltmp10: + .short .Ltmp13-.Ltmp12 # Record length +.Ltmp12: + .short 4353 # Record kind: S_OBJNAME + .long 0 # Signature + .asciz "/tmp/a-2b2ba0.obj" # Object name + .p2align 2 +.Ltmp13: + .short .Ltmp15-.Ltmp14 # Record length +.Ltmp14: + .short 4412 # Record kind: S_COMPILE3 + .long 1 # Flags and language + .short 208 # CPUType + .short 15 # Frontend version + .short 0 + .short 0 + .short 0 + .short 15000 # Backend version + .short 0 + .short 0 + .short 0 + .asciz "clang version 15.0.0" # Null-terminated compiler version string + .p2align 2 +.Ltmp15: +.Ltmp11: + .p2align 2 + .long 246 # Inlinee lines subsection + .long .Ltmp17-.Ltmp16 # Subsection size +.Ltmp16: + .long 0 # Inlinee lines signature + + # Inlined function foo starts at ./a.h:4 + .long 4099 # Type index of inlined function + .cv_filechecksumoffset 2 # Offset into filechecksum table + .long 4 # Starting line number + + # Inlined function bar starts at ./b.h:4 + .long 4106 # Type index of inlined function + .cv_filechecksumoffset 3 # Offset into filechecksum table + .long 4 # Starting line number + + # Inlined function func starts at ./c.h:4 + .long 4113 # Type index of inlined function + .cv_filechecksumoffset 4 # Offset into filechecksum table + .long 4 # Starting line number +.Ltmp17: + .p2align 2 + .section .debug$S,"dr",associative,main + .p2align 2 + .long 4 # Debug section magic + .long 241 # Symbol subsection for main + .long .Ltmp19-.Ltmp18 # Subsection size +.Ltmp18: + .short .Ltmp21-.Ltmp20 # Record length +.Ltmp20: + .short 4423 # Record kind: S_GPROC32_ID + .long 0 # PtrParent + .long 0 # PtrEnd + .long 0 # PtrNext + .long .Lfunc_end0-main # Code size + .long 0 # Offset after prologue + .long 0 # Offset before epilogue + .long 4117 # Function type index + .secrel32 main # Function section relative address + .secidx main # Function section index + .byte 0 # Flags + .asciz "main" # Function name + .p2align 2 +.Ltmp21: + .short .Ltmp23-.Ltmp22 # Record length +.Ltmp22: + .short 4114 # Record kind: S_FRAMEPROC + .long 56 # FrameSize + .long 0 # 
Padding + .long 0 # Offset of padding + .long 0 # Bytes of callee saved registers + .long 0 # Exception handler offset + .short 0 # Exception handler section + .long 81920 # Flags (defines frame register) + .p2align 2 +.Ltmp23: + .short .Ltmp25-.Ltmp24 # Record length +.Ltmp24: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "argc" + .p2align 2 +.Ltmp25: + .cv_def_range .Lfunc_begin0 .Ltmp5 .Ltmp7 .Ltmp8, reg, 18 + .short .Ltmp27-.Ltmp26 # Record length +.Ltmp26: + .short 4414 # Record kind: S_LOCAL + .long 4114 # TypeIndex + .short 1 # Flags + .asciz "argv" + .p2align 2 +.Ltmp27: + .cv_def_range .Lfunc_begin0 .Ltmp8, reg, 331 + .short .Ltmp29-.Ltmp28 # Record length +.Ltmp28: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "main_local" + .p2align 2 +.Ltmp29: + .cv_def_range .Ltmp0 .Ltmp9, frame_ptr_rel, 48 + .short .Ltmp31-.Ltmp30 # Record length +.Ltmp30: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4099 # Inlinee type index + .cv_inline_linetable 1 2 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp31: + .short .Ltmp33-.Ltmp32 # Record length +.Ltmp32: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 257 # Flags + .asciz "x" + .p2align 2 +.Ltmp33: + .short .Ltmp35-.Ltmp34 # Record length +.Ltmp34: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "foo_local" + .p2align 2 +.Ltmp35: + .cv_def_range .Ltmp0 .Ltmp6 .Ltmp7 .Ltmp9, frame_ptr_rel, 44 + .short .Ltmp37-.Ltmp36 # Record length +.Ltmp36: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4106 # Inlinee type index + .cv_inline_linetable 2 3 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp37: + .short .Ltmp39-.Ltmp38 # Record length +.Ltmp38: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "x" + .p2align 2 +.Ltmp39: + .cv_def_range .Ltmp2 .Ltmp3, reg, 17 + .short .Ltmp41-.Ltmp40 # Record length +.Ltmp40: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "bar_local" + .p2align 2 +.Ltmp41: + .cv_def_range .Ltmp2 .Ltmp6, frame_ptr_rel, 52 + .short .Ltmp43-.Ltmp42 # Record length +.Ltmp42: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4113 # Inlinee type index + .cv_inline_linetable 3 4 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp43: + .short .Ltmp45-.Ltmp44 # Record length +.Ltmp44: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "x" + .p2align 2 +.Ltmp45: + .cv_def_range .Ltmp4 .Ltmp6, reg, 17 + .short .Ltmp47-.Ltmp46 # Record length +.Ltmp46: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "func_local" + .p2align 2 +.Ltmp47: + .cv_def_range .Ltmp4 .Ltmp6, frame_ptr_rel, 48 + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4431 # Record kind: S_PROC_ID_END +.Ltmp19: + .p2align 2 + .cv_linetable 0, main, .Lfunc_end0 + .section .debug$S,"dr" + .long 241 + .long .Ltmp49-.Ltmp48 # Subsection size +.Ltmp48: + .short .Ltmp51-.Ltmp50 # Record length +.Ltmp50: + .short 4360 # Record kind: S_UDT + .long 4103 # Type + .asciz "Class1" + .p2align 2 +.Ltmp51: + .short .Ltmp53-.Ltmp52 # Record length +.Ltmp52: + 
.short 4360 # Record kind: S_UDT + .long 4110 # Type + .asciz "Namespace2::Class2" + .p2align 2 +.Ltmp53: +.Ltmp49: + .p2align 2 + .cv_filechecksums # File index to string table offset subsection + .cv_stringtable # String table + .long 241 + .long .Ltmp55-.Ltmp54 # Subsection size +.Ltmp54: + .short .Ltmp57-.Ltmp56 # Record length +.Ltmp56: + .short 4428 # Record kind: S_BUILDINFO + .long 4124 # LF_BUILDINFO index + .p2align 2 +.Ltmp57: +.Ltmp55: + .p2align 2 + .section .debug$T,"dr" + .p2align 2 + .long 4 # Debug section magic + # StringId (0x1000) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "Namespace1" # StringData + .byte 241 + # ArgList (0x1001) + .short 0xa # Record length + .short 0x1201 # Record kind: LF_ARGLIST + .long 0x1 # NumArgs + .long 0x74 # Argument: int + # Procedure (0x1002) + .short 0xe # Record length + .short 0x1008 # Record kind: LF_PROCEDURE + .long 0x74 # ReturnType: int + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + # FuncId (0x1003) + .short 0xe # Record length + .short 0x1601 # Record kind: LF_FUNC_ID + .long 0x1000 # ParentScope: Namespace1 + .long 0x1002 # FunctionType: int (int) + .asciz "foo" # Name + # Class (0x1004) + .short 0x2a # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x0 # MemberCount + .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) + .long 0x0 # FieldList + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x0 # SizeOf + .asciz "Class1" # Name + .asciz ".?AVClass1@@" # LinkageName + .byte 242 + .byte 241 + # MemberFunction (0x1005) + .short 0x1a # Record length + .short 0x1009 # Record kind: LF_MFUNCTION + .long 0x74 # ReturnType: int + .long 0x1004 # ClassType: Class1 + .long 0x0 # ThisType + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + .long 0x0 # ThisAdjustment + # FieldList (0x1006) + .short 0xe # Record length + .short 0x1203 # Record kind: LF_FIELDLIST + .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) + .short 0xb # Attrs: Public, Static + .long 0x1005 # Type: int Class1::(int) + .asciz "bar" # Name + # Class (0x1007) + .short 0x2a # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x1 # MemberCount + .short 0x200 # Properties ( HasUniqueName (0x200) ) + .long 0x1006 # FieldList: + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x1 # SizeOf + .asciz "Class1" # Name + .asciz ".?AVClass1@@" # LinkageName + .byte 242 + .byte 241 + # StringId (0x1008) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp/./b.h" # StringData + .byte 241 + # UdtSourceLine (0x1009) + .short 0xe # Record length + .short 0x1606 # Record kind: LF_UDT_SRC_LINE + .long 0x1007 # UDT: Class1 + .long 0x1008 # SourceFile: /tmp/./b.h + .long 0x2 # LineNumber + # MemberFuncId (0x100A) + .short 0xe # Record length + .short 0x1602 # Record kind: LF_MFUNC_ID + .long 0x1004 # ClassType: Class1 + .long 0x1005 # FunctionType: int Class1::(int) + .asciz "bar" # Name + # Class (0x100B) + .short 0x42 # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x0 # MemberCount + .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) + .long 0x0 # FieldList + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x0 # SizeOf + .asciz "Namespace2::Class2" # Name + .asciz ".?AVClass2@Namespace2@@" 
# LinkageName + .byte 243 + .byte 242 + .byte 241 + # MemberFunction (0x100C) + .short 0x1a # Record length + .short 0x1009 # Record kind: LF_MFUNCTION + .long 0x74 # ReturnType: int + .long 0x100b # ClassType: Namespace2::Class2 + .long 0x0 # ThisType + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + .long 0x0 # ThisAdjustment + # FieldList (0x100D) + .short 0x12 # Record length + .short 0x1203 # Record kind: LF_FIELDLIST + .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) + .short 0xb # Attrs: Public, Static + .long 0x100c # Type: int Namespace2::Class2::(int) + .asciz "func" # Name + .byte 243 + .byte 242 + .byte 241 + # Class (0x100E) + .short 0x42 # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x1 # MemberCount + .short 0x200 # Properties ( HasUniqueName (0x200) ) + .long 0x100d # FieldList: + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x1 # SizeOf + .asciz "Namespace2::Class2" # Name + .asciz ".?AVClass2@Namespace2@@" # LinkageName + .byte 243 + .byte 242 + .byte 241 + # StringId (0x100F) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp/./c.h" # StringData + .byte 241 + # UdtSourceLine (0x1010) + .short 0xe # Record length + .short 0x1606 # Record kind: LF_UDT_SRC_LINE + .long 0x100e # UDT: Namespace2::Class2 + .long 0x100f # SourceFile: /tmp/./c.h + .long 0x2 # LineNumber + # MemberFuncId (0x1011) + .short 0x12 # Record length + .short 0x1602 # Record kind: LF_MFUNC_ID + .long 0x100b # ClassType: Namespace2::Class2 + .long 0x100c # FunctionType: int Namespace2::Class2::(int) + .asciz "func" # Name + .byte 243 + .byte 242 + .byte 241 + # Pointer (0x1012) + .short 0xa # Record length + .short 0x1002 # Record kind: LF_POINTER + .long 0x670 # PointeeType: char* + .long 0x1000c # Attrs: [ Type: Near64, Mode: Pointer, SizeOf: 8 ] + # ArgList (0x1013) + .short 0xe # Record length + .short 0x1201 # Record kind: LF_ARGLIST + .long 0x2 # NumArgs + .long 0x74 # Argument: int + .long 0x1012 # Argument: char** + # Procedure (0x1014) + .short 0xe # Record length + .short 0x1008 # Record kind: LF_PROCEDURE + .long 0x74 # ReturnType: int + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x2 # NumParameters + .long 0x1013 # ArgListType: (int, char**) + # FuncId (0x1015) + .short 0x12 # Record length + .short 0x1601 # Record kind: LF_FUNC_ID + .long 0x0 # ParentScope + .long 0x1014 # FunctionType: int (int, char**) + .asciz "main" # Name + .byte 243 + .byte 242 + .byte 241 + # Modifier (0x1016) + .short 0xa # Record length + .short 0x1001 # Record kind: LF_MODIFIER + .long 0x74 # ModifiedType: int + .short 0x2 # Modifiers ( Volatile (0x2) ) + .byte 242 + .byte 241 + # StringId (0x1017) + .short 0xe # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp" # StringData + .byte 243 + .byte 242 + .byte 241 + # StringId (0x1018) + .short 0xe # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "a.cpp" # StringData + .byte 242 + .byte 241 + # StringId (0x1019) + .short 0xa # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .byte 0 # StringData + .byte 243 + .byte 242 + .byte 241 + # StringId (0x101A) + .short 0x4e # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang" # StringData + .byte 243 + .byte 242 + 
.byte 241 + # StringId (0x101B) + .short 0x9f6 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "\"-cc1\" \"-triple\" \"x86_64-pc-windows-msvc19.20.0\" \"-S\" \"-disable-free\" \"-clear-ast-before-backend\" \"-disable-llvm-verifier\" \"-discard-value-names\" \"-mrelocation-model\" \"pic\" \"-pic-level\" \"2\" \"-mframe-pointer=none\" \"-relaxed-aliasing\" \"-fmath-errno\" \"-ffp-contract=on\" \"-fno-rounding-math\" \"-mconstructor-aliases\" \"-funwind-tables=2\" \"-target-cpu\" \"x86-64\" \"-mllvm\" \"-x86-asm-syntax=intel\" \"-tune-cpu\" \"generic\" \"-mllvm\" \"-treat-scalable-fixed-error-as-warning\" \"-D_MT\" \"-flto-visibility-public-std\" \"--dependent-lib=libcmt\" \"--dependent-lib=oldnames\" \"-stack-protector\" \"2\" \"-fms-volatile\" \"-fdiagnostics-format\" \"msvc\" \"-gno-column-info\" \"-gcodeview\" \"-debug-info-kind=constructor\" \"-ffunction-sections\" \"-fcoverage-compilation-dir=/tmp\" \"-resource-dir\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt\" \"-Os\" \"-fdeprecated-macro\" \"-fdebug-compilation-dir=/tmp\" \"-ferror-limit\" \"19\" \"-fno-use-cxa-atexit\" \"-fms-extensions\" \"-fms-compatibility\" \"-fms-compatibility-version=19.20\" \"-std=c++14\" \"-fdelayed-template-parsing\" \"-fcolor-diagnostics\" \"-vectorize-loops\" \"-vectorize-slp\" \"-faddrsig\" \"-x\" \"c++\"" # StringData + .byte 242 + .byte 241 + # BuildInfo (0x101C) + .short 0x1a # Record length + .short 0x1603 # Record kind: LF_BUILDINFO + .short 0x5 # NumArgs + .long 0x1017 # Argument: /tmp + .long 0x101a # Argument: /usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang + .long 0x1018 # Argument: a.cpp + .long 0x1019 # Argument + .long 0x101b # Argument: "-cc1" "-triple" "x86_64-pc-windows-msvc19.20.0" "-S" "-disable-free" "-clear-ast-before-backend" "-disable-llvm-verifier" "-discard-value-names" "-mrelocation-model" "pic" "-pic-level" "2" "-mframe-pointer=none" "-relaxed-aliasing" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" 
"x86-64" "-mllvm" "-x86-asm-syntax=intel" "-tune-cpu" "generic" "-mllvm" "-treat-scalable-fixed-error-as-warning" "-D_MT" "-flto-visibility-public-std" "--dependent-lib=libcmt" "--dependent-lib=oldnames" "-stack-protector" "2" "-fms-volatile" "-fdiagnostics-format" "msvc" "-gno-column-info" "-gcodeview" "-debug-info-kind=constructor" "-ffunction-sections" "-fcoverage-compilation-dir=/tmp" "-resource-dir" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0" "-internal-isystem" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt" "-Os" "-fdeprecated-macro" "-fdebug-compilation-dir=/tmp" "-ferror-limit" "19" "-fno-use-cxa-atexit" "-fms-extensions" "-fms-compatibility" "-fms-compatibility-version=19.20" "-std=c++14" "-fdelayed-template-parsing" "-fcolor-diagnostics" "-vectorize-loops" "-vectorize-slp" "-faddrsig" "-x" "c++" + .byte 242 + .byte 241 + .addrsig diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit index eab5061dafbd..2291c7c45271 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit @@ -1,7 +1,7 @@ -br set -p BP_bar -f inline_sites_live.cpp -br set -p BP_foo -f inline_sites_live.cpp -run -expression param -continue -expression param -expression local +br set -p BP_bar -f inline_sites_live.cpp +br set -p BP_foo -f inline_sites_live.cpp +run +expression param +continue +expression param +expression local diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit index feda74856757..ad080da24dab 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit @@ -1,35 +1,35 @@ -image lookup -a 0x140001000 -v -image lookup -a 0x140001003 -v -image lookup -a 0x140001006 -v - -image lookup -a 0x140001011 -v -image lookup -a 0x140001017 -v -image lookup -a 0x140001019 -v -image lookup -a 0x14000101e -v -image lookup -a 
0x14000102c -v - -image lookup -a 0x140001031 -v -image lookup -a 0x140001032 -v -image lookup -a 0x140001033 -v -image lookup -a 0x140001034 -v -image lookup -a 0x140001035 -v -image lookup -a 0x140001036 -v -image lookup -a 0x140001037 -v -image lookup -a 0x14000103b -v -image lookup -a 0x14000103d -v -image lookup -a 0x14000103f -v -image lookup -a 0x140001041 -v -image lookup -a 0x140001043 -v -image lookup -a 0x140001045 -v -image lookup -a 0x140001046 -v -image lookup -a 0x140001047 -v -image lookup -a 0x140001048 -v -image lookup -a 0x140001049 -v -image lookup -a 0x14000104a -v -image lookup -a 0x14000104b -v -image lookup -a 0x14000104c -v -image lookup -a 0x14000104e -v -image lookup -a 0x14000104f -v -image lookup -a 0x140001050 -v -image lookup -a 0x140001051 -v -exit +image lookup -a 0x140001000 -v +image lookup -a 0x140001003 -v +image lookup -a 0x140001006 -v + +image lookup -a 0x140001011 -v +image lookup -a 0x140001017 -v +image lookup -a 0x140001019 -v +image lookup -a 0x14000101e -v +image lookup -a 0x14000102c -v + +image lookup -a 0x140001031 -v +image lookup -a 0x140001032 -v +image lookup -a 0x140001033 -v +image lookup -a 0x140001034 -v +image lookup -a 0x140001035 -v +image lookup -a 0x140001036 -v +image lookup -a 0x140001037 -v +image lookup -a 0x14000103b -v +image lookup -a 0x14000103d -v +image lookup -a 0x14000103f -v +image lookup -a 0x140001041 -v +image lookup -a 0x140001043 -v +image lookup -a 0x140001045 -v +image lookup -a 0x140001046 -v +image lookup -a 0x140001047 -v +image lookup -a 0x140001048 -v +image lookup -a 0x140001049 -v +image lookup -a 0x14000104a -v +image lookup -a 0x14000104b -v +image lookup -a 0x14000104c -v +image lookup -a 0x14000104e -v +image lookup -a 0x14000104f -v +image lookup -a 0x140001050 -v +image lookup -a 0x140001051 -v +exit diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit index 3f639eb2e539..afe3f2c8b943 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit @@ -1,4 +1,4 @@ -image lookup -type A -image lookup -type B - +image lookup -type A +image lookup -type B + quit \ No newline at end of file diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit index 32758f1fbc51..3dc33fd789da 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit @@ -1,2 +1,2 @@ -image lookup -a 0x40102f -v -quit +image lookup -a 0x40102f -v +quit diff --git a/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp b/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp index f0fac90e5065..ca2a84de7698 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp @@ -113,9 +113,9 @@ auto incomplete = &three; // CHECK: |-CXXRecordDecl {{.*}} union U // CHECK: |-EnumDecl {{.*}} E // CHECK: |-CXXRecordDecl {{.*}} struct S -// CHECK: |-VarDecl {{.*}} a 'S (*)(C *, U &, E &&)' -// CHECK: |-VarDecl {{.*}} b 'E (*)(const S *, const C &, const U &&)' -// CHECK: |-VarDecl {{.*}} c 'U (*)(volatile E *, volatile S &, volatile C &&)' +// CHECK: |-VarDecl {{.*}} a 'S (*)(C *, U &, E &&)' +// CHECK: 
|-VarDecl {{.*}} b 'E (*)(const S *, const C &, const U &&)' +// CHECK: |-VarDecl {{.*}} c 'U (*)(volatile E *, volatile S &, volatile C &&)' // CHECK: |-VarDecl {{.*}} d 'C (*)(const volatile U *, const volatile E &, const volatile S &&)' // CHECK: |-CXXRecordDecl {{.*}} struct B // CHECK: | `-CXXRecordDecl {{.*}} struct A @@ -125,14 +125,14 @@ auto incomplete = &three; // CHECK: | | `-CXXRecordDecl {{.*}} struct S // CHECK: | `-NamespaceDecl {{.*}} B // CHECK: | `-CXXRecordDecl {{.*}} struct S -// CHECK: |-VarDecl {{.*}} e 'A::B::S *(*)(B::A::S *, A::C::S &)' -// CHECK: |-VarDecl {{.*}} f 'A::C::S &(*)(A::B::S *, B::A::S *)' +// CHECK: |-VarDecl {{.*}} e 'A::B::S *(*)(B::A::S *, A::C::S &)' +// CHECK: |-VarDecl {{.*}} f 'A::C::S &(*)(A::B::S *, B::A::S *)' // CHECK: |-VarDecl {{.*}} g 'B::A::S *(*)(A::C::S &, A::B::S *)' // CHECK: |-CXXRecordDecl {{.*}} struct TC // CHECK: |-CXXRecordDecl {{.*}} struct TC> // CHECK: |-CXXRecordDecl {{.*}} struct TC // CHECK: |-CXXRecordDecl {{.*}} struct TC -// CHECK: |-VarDecl {{.*}} h 'TC (*)(TC, TC>, TC)' +// CHECK: |-VarDecl {{.*}} h 'TC (*)(TC, TC>, TC)' // CHECK: |-VarDecl {{.*}} i 'A::B::S (*)()' // CHECK: |-CXXRecordDecl {{.*}} struct Incomplete // CHECK: `-VarDecl {{.*}} incomplete 'Incomplete *(*)(Incomplete **, const Incomplete *)' diff --git a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp index 402982726965..767149ea18c4 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp @@ -1,34 +1,34 @@ -// clang-format off -// REQUIRES: system-windows - -// RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ -// RUN: %p/Inputs/inline_sites_live.lldbinit 2>&1 | FileCheck %s - -void use(int) {} - -void __attribute__((always_inline)) bar(int param) { - use(param); // BP_bar -} - -void __attribute__((always_inline)) foo(int param) { - int local = param+1; - bar(local); - use(param); - use(local); // BP_foo -} - -int main(int argc, char** argv) { - foo(argc); -} - -// CHECK: * thread #1, stop reason = breakpoint 1 -// CHECK-NEXT: frame #0: {{.*}}`main [inlined] bar(param=2) -// CHECK: (lldb) expression param -// CHECK-NEXT: (int) $0 = 2 -// CHECK: * thread #1, stop reason = breakpoint 2 -// CHECK-NEXT: frame #0: {{.*}}`main [inlined] foo(param=1) -// CHECK: (lldb) expression param -// CHECK-NEXT: (int) $1 = 1 -// CHECK-NEXT: (lldb) expression local -// CHECK-NEXT: (int) $2 = 2 +// clang-format off +// REQUIRES: system-windows + +// RUN: %build -o %t.exe -- %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %p/Inputs/inline_sites_live.lldbinit 2>&1 | FileCheck %s + +void use(int) {} + +void __attribute__((always_inline)) bar(int param) { + use(param); // BP_bar +} + +void __attribute__((always_inline)) foo(int param) { + int local = param+1; + bar(local); + use(param); + use(local); // BP_foo +} + +int main(int argc, char** argv) { + foo(argc); +} + +// CHECK: * thread #1, stop reason = breakpoint 1 +// CHECK-NEXT: frame #0: {{.*}}`main [inlined] bar(param=2) +// CHECK: (lldb) expression param +// CHECK-NEXT: (int) $0 = 2 +// CHECK: * thread #1, stop reason = breakpoint 2 +// CHECK-NEXT: frame #0: {{.*}}`main [inlined] foo(param=1) +// CHECK: (lldb) expression param +// CHECK-NEXT: (int) $1 = 1 +// CHECK-NEXT: (lldb) expression local +// CHECK-NEXT: (int) $2 = 2 diff --git a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp 
b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp index cd5bbfc30fa0..f3aea8115f38 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp @@ -1,46 +1,46 @@ -// clang-format off - -// RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ -// RUN: %p/Inputs/lookup-by-types.lldbinit 2>&1 | FileCheck %s - -class B; -class A { -public: - static const A constA; - static A a; - static B b; - int val = 1; -}; -class B { -public: - static A a; - int val = 2; -}; -A varA; -B varB; -const A A::constA = varA; -A A::a = varA; -B A::b = varB; -A B::a = varA; - -int main(int argc, char **argv) { - return varA.val + varB.val; -} - -// CHECK: image lookup -type A -// CHECK-NEXT: 1 match found in {{.*}}.exe -// CHECK-NEXT: compiler_type = "class A { -// CHECK-NEXT: static const A constA; -// CHECK-NEXT: static A a; -// CHECK-NEXT: static B b; -// CHECK-NEXT: public: -// CHECK-NEXT: int val; -// CHECK-NEXT: }" -// CHECK: image lookup -type B -// CHECK-NEXT: 1 match found in {{.*}}.exe -// CHECK-NEXT: compiler_type = "class B { -// CHECK-NEXT: static A a; -// CHECK-NEXT: public: -// CHECK-NEXT: int val; -// CHECK-NEXT: }" +// clang-format off + +// RUN: %build -o %t.exe -- %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %p/Inputs/lookup-by-types.lldbinit 2>&1 | FileCheck %s + +class B; +class A { +public: + static const A constA; + static A a; + static B b; + int val = 1; +}; +class B { +public: + static A a; + int val = 2; +}; +A varA; +B varB; +const A A::constA = varA; +A A::a = varA; +B A::b = varB; +A B::a = varA; + +int main(int argc, char **argv) { + return varA.val + varB.val; +} + +// CHECK: image lookup -type A +// CHECK-NEXT: 1 match found in {{.*}}.exe +// CHECK-NEXT: compiler_type = "class A { +// CHECK-NEXT: static const A constA; +// CHECK-NEXT: static A a; +// CHECK-NEXT: static B b; +// CHECK-NEXT: public: +// CHECK-NEXT: int val; +// CHECK-NEXT: }" +// CHECK: image lookup -type B +// CHECK-NEXT: 1 match found in {{.*}}.exe +// CHECK-NEXT: compiler_type = "class B { +// CHECK-NEXT: static A a; +// CHECK-NEXT: public: +// CHECK-NEXT: int val; +// CHECK-NEXT: }" diff --git a/lldb/unittests/Breakpoint/CMakeLists.txt b/lldb/unittests/Breakpoint/CMakeLists.txt index db985bc82dc5..757c2da1a4d9 100644 --- a/lldb/unittests/Breakpoint/CMakeLists.txt +++ b/lldb/unittests/Breakpoint/CMakeLists.txt @@ -1,10 +1,10 @@ -add_lldb_unittest(LLDBBreakpointTests - BreakpointIDTest.cpp - WatchpointAlgorithmsTests.cpp - - LINK_LIBS - lldbBreakpoint - lldbCore - LINK_COMPONENTS - Support - ) +add_lldb_unittest(LLDBBreakpointTests + BreakpointIDTest.cpp + WatchpointAlgorithmsTests.cpp + + LINK_LIBS + lldbBreakpoint + lldbCore + LINK_COMPONENTS + Support + ) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp index e351db338730..c03ead400d0d 100644 --- a/llvm/benchmarks/FormatVariadicBM.cpp +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -1,63 +1,63 @@ -//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "benchmark/benchmark.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-using namespace std;
-
-// Generate a list of format strings that have `NumReplacements` replacements
-// by permuting the replacements and some literal text.
-static vector<string> getFormatStrings(int NumReplacements) {
-  vector<string> Components;
-  for (int I = 0; I < NumReplacements; I++)
-    Components.push_back("{" + to_string(I) + "}");
-  // Intersperse these with some other literal text (_).
-  const string_view Literal = "____";
-  for (char C : Literal)
-    Components.push_back(string(1, C));
-
-  vector<string> Formats;
-  do {
-    string Concat;
-    for (const string &C : Components)
-      Concat += C;
-    Formats.emplace_back(Concat);
-  } while (next_permutation(Components.begin(), Components.end()));
-  return Formats;
-}
-
-// Generate the set of formats to exercise outside the benchmark code.
-static const vector<vector<string>> Formats = {
-    getFormatStrings(1), getFormatStrings(2), getFormatStrings(3),
-    getFormatStrings(4), getFormatStrings(5),
-};
-
-// Benchmark formatv() for a variety of format strings and 1-5 replacements.
-static void BM_FormatVariadic(benchmark::State &state) {
-  for (auto _ : state) {
-    for (const string &Fmt : Formats[0])
-      formatv(Fmt.c_str(), 1).str();
-    for (const string &Fmt : Formats[1])
-      formatv(Fmt.c_str(), 1, 2).str();
-    for (const string &Fmt : Formats[2])
-      formatv(Fmt.c_str(), 1, 2, 3).str();
-    for (const string &Fmt : Formats[3])
-      formatv(Fmt.c_str(), 1, 2, 3, 4).str();
-    for (const string &Fmt : Formats[4])
-      formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str();
-  }
-}
-
-BENCHMARK(BM_FormatVariadic);
-
-BENCHMARK_MAIN();
+//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "benchmark/benchmark.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace std;
+
+// Generate a list of format strings that have `NumReplacements` replacements
+// by permuting the replacements and some literal text.
+static vector<string> getFormatStrings(int NumReplacements) {
+  vector<string> Components;
+  for (int I = 0; I < NumReplacements; I++)
+    Components.push_back("{" + to_string(I) + "}");
+  // Intersperse these with some other literal text (_).
+  const string_view Literal = "____";
+  for (char C : Literal)
+    Components.push_back(string(1, C));
+
+  vector<string> Formats;
+  do {
+    string Concat;
+    for (const string &C : Components)
+      Concat += C;
+    Formats.emplace_back(Concat);
+  } while (next_permutation(Components.begin(), Components.end()));
+  return Formats;
+}
+
+// Generate the set of formats to exercise outside the benchmark code.
+static const vector<vector<string>> Formats = {
+    getFormatStrings(1), getFormatStrings(2), getFormatStrings(3),
+    getFormatStrings(4), getFormatStrings(5),
+};
+
+// Benchmark formatv() for a variety of format strings and 1-5 replacements.
+static void BM_FormatVariadic(benchmark::State &state) { + for (auto _ : state) { + for (const string &Fmt : Formats[0]) + formatv(Fmt.c_str(), 1).str(); + for (const string &Fmt : Formats[1]) + formatv(Fmt.c_str(), 1, 2).str(); + for (const string &Fmt : Formats[2]) + formatv(Fmt.c_str(), 1, 2, 3).str(); + for (const string &Fmt : Formats[3]) + formatv(Fmt.c_str(), 1, 2, 3, 4).str(); + for (const string &Fmt : Formats[4]) + formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); + } +} + +BENCHMARK(BM_FormatVariadic); + +BENCHMARK_MAIN(); diff --git a/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp b/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp index 953d9125e11e..fa9c528424c9 100644 --- a/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp +++ b/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp @@ -1,50 +1,50 @@ -#include "benchmark/benchmark.h" -#include "llvm/IR/Intrinsics.h" - -using namespace llvm; -using namespace Intrinsic; - -// Benchmark intrinsic lookup from a variety of targets. -static void BM_GetIntrinsicForClangBuiltin(benchmark::State &state) { - static const char *Builtins[] = { - "__builtin_adjust_trampoline", - "__builtin_trap", - "__builtin_arm_ttest", - "__builtin_amdgcn_cubetc", - "__builtin_amdgcn_udot2", - "__builtin_arm_stc", - "__builtin_bpf_compare", - "__builtin_HEXAGON_A2_max", - "__builtin_lasx_xvabsd_b", - "__builtin_mips_dlsa", - "__nvvm_floor_f", - "__builtin_altivec_vslb", - "__builtin_r600_read_tgid_x", - "__builtin_riscv_aes64im", - "__builtin_s390_vcksm", - "__builtin_ve_vl_pvfmksge_Mvl", - "__builtin_ia32_axor64", - "__builtin_bitrev", - }; - static const char *Targets[] = {"", "aarch64", "amdgcn", "mips", - "nvvm", "r600", "riscv"}; - - for (auto _ : state) { - for (auto Builtin : Builtins) - for (auto Target : Targets) - getIntrinsicForClangBuiltin(Target, Builtin); - } -} - -static void -BM_GetIntrinsicForClangBuiltinHexagonFirst(benchmark::State &state) { - // Exercise the worst case by looking for the first builtin for a target - // that has a lot of builtins. - for (auto _ : state) - getIntrinsicForClangBuiltin("hexagon", "__builtin_HEXAGON_A2_abs"); -} - -BENCHMARK(BM_GetIntrinsicForClangBuiltin); -BENCHMARK(BM_GetIntrinsicForClangBuiltinHexagonFirst); - -BENCHMARK_MAIN(); +#include "benchmark/benchmark.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace Intrinsic; + +// Benchmark intrinsic lookup from a variety of targets. +static void BM_GetIntrinsicForClangBuiltin(benchmark::State &state) { + static const char *Builtins[] = { + "__builtin_adjust_trampoline", + "__builtin_trap", + "__builtin_arm_ttest", + "__builtin_amdgcn_cubetc", + "__builtin_amdgcn_udot2", + "__builtin_arm_stc", + "__builtin_bpf_compare", + "__builtin_HEXAGON_A2_max", + "__builtin_lasx_xvabsd_b", + "__builtin_mips_dlsa", + "__nvvm_floor_f", + "__builtin_altivec_vslb", + "__builtin_r600_read_tgid_x", + "__builtin_riscv_aes64im", + "__builtin_s390_vcksm", + "__builtin_ve_vl_pvfmksge_Mvl", + "__builtin_ia32_axor64", + "__builtin_bitrev", + }; + static const char *Targets[] = {"", "aarch64", "amdgcn", "mips", + "nvvm", "r600", "riscv"}; + + for (auto _ : state) { + for (auto Builtin : Builtins) + for (auto Target : Targets) + getIntrinsicForClangBuiltin(Target, Builtin); + } +} + +static void +BM_GetIntrinsicForClangBuiltinHexagonFirst(benchmark::State &state) { + // Exercise the worst case by looking for the first builtin for a target + // that has a lot of builtins. 
+  for (auto _ : state)
+    getIntrinsicForClangBuiltin("hexagon", "__builtin_HEXAGON_A2_abs");
+}
+
+BENCHMARK(BM_GetIntrinsicForClangBuiltin);
+BENCHMARK(BM_GetIntrinsicForClangBuiltinHexagonFirst);
+
+BENCHMARK_MAIN();
diff --git a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp
index 758291274675..7f3bd3bc9eb6 100644
--- a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp
+++ b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp
@@ -1,30 +1,30 @@
-//===- GetIntrinsicInfoTableEntries.cpp - IIT signature benchmark ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "benchmark/benchmark.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Intrinsics.h"
-
-using namespace llvm;
-using namespace Intrinsic;
-
-static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) {
-  SmallVector<IITDescriptor> Table;
-  for (auto _ : state) {
-    for (ID ID = 1; ID < num_intrinsics; ++ID) {
-      // This makes sure the vector does not keep growing, as well as after the
-      // first iteration does not result in additional allocations.
-      Table.clear();
-      getIntrinsicInfoTableEntries(ID, Table);
-    }
-  }
-}
-
-BENCHMARK(BM_GetIntrinsicInfoTableEntries);
-
-BENCHMARK_MAIN();
+//===- GetIntrinsicInfoTableEntries.cpp - IIT signature benchmark ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "benchmark/benchmark.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Intrinsics.h"
+
+using namespace llvm;
+using namespace Intrinsic;
+
+static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) {
+  SmallVector<IITDescriptor> Table;
+  for (auto _ : state) {
+    for (ID ID = 1; ID < num_intrinsics; ++ID) {
+      // This makes sure the vector does not keep growing, as well as after the
+      // first iteration does not result in additional allocations.
+ Table.clear(); + getIntrinsicInfoTableEntries(ID, Table); + } + } +} + +BENCHMARK(BM_GetIntrinsicInfoTableEntries); + +BENCHMARK_MAIN(); diff --git a/llvm/docs/_static/LoopOptWG_invite.ics b/llvm/docs/_static/LoopOptWG_invite.ics index 7c92e4048cc3..65597d90a9c8 100644 --- a/llvm/docs/_static/LoopOptWG_invite.ics +++ b/llvm/docs/_static/LoopOptWG_invite.ics @@ -1,80 +1,80 @@ -BEGIN:VCALENDAR -PRODID:-//Google Inc//Google Calendar 70.9054//EN -VERSION:2.0 -CALSCALE:GREGORIAN -METHOD:PUBLISH -X-WR-CALNAME:LLVM Loop Optimization Discussion -X-WR-TIMEZONE:Europe/Berlin -BEGIN:VTIMEZONE -TZID:America/New_York -X-LIC-LOCATION:America/New_York -BEGIN:DAYLIGHT -TZOFFSETFROM:-0500 -TZOFFSETTO:-0400 -TZNAME:EDT -DTSTART:19700308T020000 -RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU -END:DAYLIGHT -BEGIN:STANDARD -TZOFFSETFROM:-0400 -TZOFFSETTO:-0500 -TZNAME:EST -DTSTART:19701101T020000 -RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU -END:STANDARD -END:VTIMEZONE -BEGIN:VEVENT -DTSTART;TZID=America/New_York:20240904T110000 -DTEND;TZID=America/New_York:20240904T120000 -RRULE:FREQ=MONTHLY;BYDAY=1WE -DTSTAMP:20240821T160951Z -UID:58h3f0kd3aooohmeii0johh23c@google.com -X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg -CREATED:20240821T151507Z -DESCRIPTION:LLVM Loop Optimization Discussion
Video call link:
https://meet.google.c - om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB - 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ - :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ - nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) - +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm - z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp - ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n - -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ - :~:~::~:~::- -LAST-MODIFIED:20240821T160941Z -SEQUENCE:0 -STATUS:CONFIRMED -SUMMARY:LLVM Loop Optimization Discussion -TRANSP:OPAQUE -END:VEVENT -BEGIN:VEVENT -DTSTART;TZID=America/New_York:20240904T110000 -DTEND;TZID=America/New_York:20240904T120000 -DTSTAMP:20240821T160951Z -UID:58h3f0kd3aooohmeii0johh23c@google.com -X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg -RECURRENCE-ID;TZID=America/New_York:20240904T110000 -CREATED:20240821T151507Z -DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c - om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB - 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ - :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ - nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) - +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm - z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp - ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n - -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ - :~:~::~:~::- -LAST-MODIFIED:20240821T160941Z -SEQUENCE:0 -STATUS:CONFIRMED -SUMMARY:LLVM Loop Optimization Discussion -TRANSP:OPAQUE -END:VEVENT -END:VCALENDAR +BEGIN:VCALENDAR +PRODID:-//Google Inc//Google Calendar 70.9054//EN +VERSION:2.0 +CALSCALE:GREGORIAN +METHOD:PUBLISH +X-WR-CALNAME:LLVM Loop Optimization Discussion +X-WR-TIMEZONE:Europe/Berlin +BEGIN:VTIMEZONE +TZID:America/New_York +X-LIC-LOCATION:America/New_York +BEGIN:DAYLIGHT +TZOFFSETFROM:-0500 +TZOFFSETTO:-0400 +TZNAME:EDT +DTSTART:19700308T020000 +RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU +END:DAYLIGHT +BEGIN:STANDARD +TZOFFSETFROM:-0400 +TZOFFSETTO:-0500 +TZNAME:EST +DTSTART:19701101T020000 +RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU +END:STANDARD +END:VTIMEZONE +BEGIN:VEVENT +DTSTART;TZID=America/New_York:20240904T110000 +DTEND;TZID=America/New_York:20240904T120000 +RRULE:FREQ=MONTHLY;BYDAY=1WE +DTSTAMP:20240821T160951Z +UID:58h3f0kd3aooohmeii0johh23c@google.com +X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg +CREATED:20240821T151507Z +DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c + om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB + 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ + :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ + nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) + +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm + z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp + ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n + -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ + :~:~::~:~::- +LAST-MODIFIED:20240821T160941Z +SEQUENCE:0 +STATUS:CONFIRMED +SUMMARY:LLVM Loop Optimization Discussion +TRANSP:OPAQUE +END:VEVENT +BEGIN:VEVENT +DTSTART;TZID=America/New_York:20240904T110000 +DTEND;TZID=America/New_York:20240904T120000 +DTSTAMP:20240821T160951Z +UID:58h3f0kd3aooohmeii0johh23c@google.com +X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg +RECURRENCE-ID;TZID=America/New_York:20240904T110000 +CREATED:20240821T151507Z +DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c + om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB + 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ + :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ + nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) + +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm + z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp + ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n + -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ + :~:~::~:~::- +LAST-MODIFIED:20240821T160941Z +SEQUENCE:0 +STATUS:CONFIRMED +SUMMARY:LLVM Loop Optimization Discussion +TRANSP:OPAQUE +END:VEVENT +END:VCALENDAR diff --git a/llvm/lib/Support/rpmalloc/CACHE.md b/llvm/lib/Support/rpmalloc/CACHE.md index 645093026deb..052320baf532 100644 --- a/llvm/lib/Support/rpmalloc/CACHE.md +++ b/llvm/lib/Support/rpmalloc/CACHE.md @@ -1,19 +1,19 @@ -# Thread caches -rpmalloc has a thread cache of free memory blocks which can be used in allocations without interfering with other threads or going to system to map more memory, as well as a global cache shared by all threads to let spans of memory pages flow between threads. Configuring the size of these caches can be crucial to obtaining good performance while minimizing memory overhead blowup. Below is a simple case study using the benchmark tool to compare different thread cache configurations for rpmalloc. - -The rpmalloc thread cache is configured to be unlimited, performance oriented as meaning default values, size oriented where both thread cache and global cache is reduced significantly, or disabled where both thread and global caches are disabled and completely free pages are directly unmapped. - -The benchmark is configured to run threads allocating 150000 blocks distributed in the `[16, 16000]` bytes range with a linear falloff probability. It runs 1000 loops, and every iteration 75000 blocks (50%) are freed and allocated in a scattered pattern. There are no cross thread allocations/deallocations. Parameters: `benchmark n 0 0 0 1000 150000 75000 16 16000`. The benchmarks are run on an Ubuntu 16.10 machine with 8 cores (4 physical, HT) and 12GiB RAM. - -The benchmark also includes results for the standard library malloc implementation as a reference for comparison with the nocache setting. - -![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=387883204&format=image) -![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=1644710241&format=image) - -For single threaded case the unlimited cache and performance oriented cache settings have identical performance and memory overhead, indicating that the memory pages fit in the combined thread and global cache. As number of threads increase to 2-4 threads, the performance settings have slightly higher performance which can seem odd at first, but can be explained by low contention on the global cache where some memory pages can flow between threads without stalling, reducing the overall number of calls to map new memory pages (also indicated by the slightly lower memory overhead). - -As threads increase even more to 5-10 threads, the increased contention and eventual limit of global cache cause the unlimited setting to gain a slight advantage in performance. 
As expected the memory overhead remains constant for unlimited caches, while going down for performance setting when number of threads increases.
-
-The size oriented setting maintain good performance compared to the standard library while reducing the memory overhead compared to the performance setting with a decent amount.
-
-The nocache setting still outperforms the reference standard library allocator for workloads up to 6 threads while maintaining a near zero memory overhead, which is even slightly lower than the standard library. For use case scenarios where number of allocation of each size class is lower the overhead in rpmalloc from the 64KiB span size will of course increase.
+# Thread caches
+rpmalloc has a thread cache of free memory blocks which can be used in allocations without interfering with other threads or going to the system to map more memory, as well as a global cache shared by all threads to let spans of memory pages flow between threads. Configuring the size of these caches can be crucial to obtaining good performance while minimizing memory overhead blowup. Below is a simple case study using the benchmark tool to compare different thread cache configurations for rpmalloc.
+
+The rpmalloc thread cache is configured to be unlimited, performance oriented (meaning default values), size oriented where both thread cache and global cache are reduced significantly, or disabled where both thread and global caches are disabled and completely free pages are directly unmapped.
+
+The benchmark is configured to run threads allocating 150000 blocks distributed in the `[16, 16000]` bytes range with a linear falloff probability. It runs 1000 loops, and every iteration 75000 blocks (50%) are freed and allocated in a scattered pattern. There are no cross thread allocations/deallocations. Parameters: `benchmark n 0 0 0 1000 150000 75000 16 16000`. The benchmarks are run on an Ubuntu 16.10 machine with 8 cores (4 physical, HT) and 12GiB RAM.
+
+The benchmark also includes results for the standard library malloc implementation as a reference for comparison with the nocache setting.
+
+![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=387883204&format=image)
+![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=1644710241&format=image)
+
+For the single threaded case the unlimited cache and performance oriented cache settings have identical performance and memory overhead, indicating that the memory pages fit in the combined thread and global cache. As the number of threads increases to 2-4, the performance settings have slightly higher performance, which can seem odd at first but can be explained by low contention on the global cache, where some memory pages can flow between threads without stalling, reducing the overall number of calls to map new memory pages (also indicated by the slightly lower memory overhead).
+
+As threads increase even more to 5-10 threads, the increased contention and eventual limit of the global cache cause the unlimited setting to gain a slight advantage in performance. As expected the memory overhead remains constant for unlimited caches, while going down for the performance setting as the number of threads increases.
+
+The size oriented setting maintains good performance compared to the standard library while reducing the memory overhead compared to the performance setting by a decent amount.
+
+The nocache setting still outperforms the reference standard library allocator for workloads up to 6 threads while maintaining a near zero memory overhead, which is even slightly lower than the standard library. For use cases where the number of allocations of each size class is lower, the overhead in rpmalloc from the 64KiB span size will of course increase.
diff --git a/llvm/lib/Support/rpmalloc/README.md b/llvm/lib/Support/rpmalloc/README.md
index 2233df9da42d..916bca0118d8 100644
--- a/llvm/lib/Support/rpmalloc/README.md
+++ b/llvm/lib/Support/rpmalloc/README.md
@@ -1,220 +1,220 @@
-# rpmalloc - General Purpose Memory Allocator
-This library provides a cross platform lock free thread caching 16-byte aligned memory allocator implemented in C.
-This is a fork of rpmalloc 1.4.5.
-
-Platforms currently supported:
-
-- Windows
-- MacOS
-- iOS
-- Linux
-- Android
-- Haiku
-
-The code should be easily portable to any platform with atomic operations and an mmap-style virtual memory management API. The API used to map/unmap memory pages can be configured in runtime to a custom implementation and mapping granularity/size.
-
-This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license.
-
-# Performance
-We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~3000 lines of C code. All allocations have a natural 16-byte alignment.
-
-Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The setup of number of thread, cross-thread deallocation rate and allocation size limits is configured by command line arguments.
-
-https://github.com/mjansson/rpmalloc-benchmark
-
-Below is an example performance comparison chart of rpmalloc and other popular allocator implementations, with default configurations used.
-
-![Ubuntu 16.10, random [16, 8000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=301017877&format=image)
-
-The benchmark producing these numbers were run on an Ubuntu 16.10 machine with 8 logical cores (4 physical, HT). The actual numbers are not to be interpreted as absolute performance figures, but rather as relative comparisons between the different allocators. For additional benchmark results, see the [BENCHMARKS](BENCHMARKS.md) file.
-
-Configuration of the thread and global caches can be important depending on your use pattern. See [CACHE](CACHE.md) for a case study and some comments/guidelines.
-
-# Required functions
-
-Before calling any other function in the API, you __MUST__ call the initialization function, either __rpmalloc_initialize__ or __rpmalloc_initialize_config__, or you will get undefined behaviour when calling other rpmalloc entry point.
- -Before terminating your use of the allocator, you __SHOULD__ call __rpmalloc_finalize__ in order to release caches and unmap virtual memory, as well as prepare the allocator for global scope cleanup at process exit or dynamic library unload depending on your use case. - -# Using -The easiest way to use the library is simply adding __rpmalloc.[h|c]__ to your project and compile them along with your sources. This contains only the rpmalloc specific entry points and does not provide internal hooks to process and/or thread creation at the moment. You are required to call these functions from your own code in order to initialize and finalize the allocator in your process and threads: - -__rpmalloc_initialize__ : Call at process start to initialize the allocator - -__rpmalloc_initialize_config__ : Optional entry point to call at process start to initialize the allocator with a custom memory mapping backend, memory page size and mapping granularity. - -__rpmalloc_finalize__: Call at process exit to finalize the allocator - -__rpmalloc_thread_initialize__: Call at each thread start to initialize the thread local data for the allocator - -__rpmalloc_thread_finalize__: Call at each thread exit to finalize and release thread cache back to global cache - -__rpmalloc_config__: Get the current runtime configuration of the allocator - -Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replacement functions. Remember all allocations are 16-byte aligned, so no need to call the explicit rpmemalign/rpaligned_alloc/rpposix_memalign functions unless you need greater alignment, they are simply wrappers to make it easier to replace in existing code. - -If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero which will include the `malloc.c` file in compilation of __rpmalloc.c__, and then rebuild the library or your project where you added the rpmalloc source. If you compile rpmalloc as a separate library you must make the linker use the override symbols from the library by referencing at least one symbol. The easiest way is to simply include `rpmalloc.h` in at least one source file and call `rpmalloc_linker_reference` somewhere - it's a dummy empty function. On Windows platforms and C++ overrides you have to `#include ` in at least one source file and also manually handle the initialize/finalize of the process and all threads. The list of libc entry points replaced may not be complete, use libc/stdc++ replacement only as a convenience for testing the library on an existing code base, not a final solution. - -For explicit first class heaps, see the __rpmalloc_heap_*__ API under [first class heaps](#first-class-heaps) section, requiring __RPMALLOC_FIRST_CLASS_HEAPS__ tp be defined to 1. - -# Building -To compile as a static library run the configure python script which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides. - -The configure + ninja build also produces two shared object/dynamic libraries. The `rpmallocwrap` shared library can be used with LD_PRELOAD/DYLD_INSERT_LIBRARIES to inject in a preexisting binary, replacing any malloc/free family of function calls. This is only implemented for Linux and macOS targets. 
The list of libc entry points replaced may not be complete, use preloading as a convenience for testing the library on an existing binary, not a final solution. The dynamic library also provides automatic init/fini of process and threads for all platforms. - -The latest stable release is available in the master branch. For latest development code, use the develop branch. - -# Cache configuration options -Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by editing the individual defines in the `rpmalloc.c` source file for fine tuned control. - -__ENABLE_UNLIMITED_CACHE__: By default defined to 0, set to 1 to make all caches infinite, i.e never release spans to global cache unless thread finishes and never unmap memory pages back to the OS. Highest performance but largest memory overhead. - -__ENABLE_UNLIMITED_GLOBAL_CACHE__: By default defined to 0, set to 1 to make global caches infinite, i.e never unmap memory pages back to the OS. - -__ENABLE_UNLIMITED_THREAD_CACHE__: By default defined to 0, set to 1 to make thread caches infinite, i.e never release spans to global cache unless thread finishes. - -__ENABLE_GLOBAL_CACHE__: By default defined to 1, enables the global cache shared between all threads. Set to 0 to disable the global cache and directly unmap pages evicted from the thread cache. - -__ENABLE_THREAD_CACHE__: By default defined to 1, enables the per-thread cache. Set to 0 to disable the thread cache and directly unmap pages no longer in use (also disables the global cache). - -__ENABLE_ADAPTIVE_THREAD_CACHE__: Introduces a simple heuristics in the thread cache size, keeping 25% of the high water mark for each span count class. - -# Other configuration options -Detailed statistics are available if __ENABLE_STATISTICS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. This will cause a slight overhead in runtime to collect statistics for each memory operation, and will also add 4 bytes overhead per allocation to track sizes. - -Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. If enabled, size arguments to the global entry points are verified not to cause integer overflows in calculations. - -Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. - -To include __malloc.c__ in compilation and provide overrides of standard library malloc entry points define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization of finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1. - -To enable the runtime configurable memory page and span sizes, define __RPMALLOC_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB. - -To enable support for first class heaps, define __RPMALLOC_FIRST_CLASS_HEAPS__ to 1. By default, the first class heap API is disabled. 
- -# Huge pages -The allocator has support for huge/large pages on Windows, Linux and MacOS. To enable it, pass a non-zero value in the config value `enable_huge_pages` when initializing the allocator with `rpmalloc_initialize_config`. If the system does not support huge pages it will be automatically disabled. You can query the status by looking at `enable_huge_pages` in the config returned from a call to `rpmalloc_config` after initialization is done. - -# Quick overview -The allocator is similar in spirit to tcmalloc from the [Google Performance Toolkit](https://github.com/gperftools/gperftools). It uses separate heaps for each thread and partitions memory blocks according to a preconfigured set of size classes, up to 2MiB. Larger blocks are mapped and unmapped directly. Allocations for different size classes will be served from different set of memory pages, each "span" of pages is dedicated to one size class. Spans of pages can flow between threads when the thread cache overflows and are released to a global cache, or when the thread ends. Unlike tcmalloc, single blocks do not flow between threads, only entire spans of pages. - -# Implementation details -The allocator is based on a fixed but configurable page alignment (defaults to 64KiB) and 16 byte block alignment, where all runs of memory pages (spans) are mapped to this alignment boundary. On Windows this is automatically guaranteed up to 64KiB by the VirtualAlloc granularity, and on mmap systems it is achieved by oversizing the mapping and aligning the returned virtual memory address to the required boundaries. By aligning to a fixed size the free operation can locate the header of the memory span without having to do a table lookup (as tcmalloc does) by simply masking out the low bits of the address (for 64KiB this would be the low 16 bits). - -Memory blocks are divided into three categories. For 64KiB span size/alignment the small blocks are [16, 1024] bytes, medium blocks (1024, 32256] bytes, and large blocks (32256, 2097120] bytes. The three categories are further divided in size classes. If the span size is changed, the small block classes remain but medium blocks go from (1024, span size] bytes. - -Small blocks have a size class granularity of 16 bytes each in 64 buckets. Medium blocks have a granularity of 512 bytes, 61 buckets (default). Large blocks have the same granularity as the configured span size (default 64KiB). All allocations are fitted to these size class boundaries (an allocation of 36 bytes will allocate a block of 48 bytes). Each small and medium size class has an associated span (meaning a contiguous set of memory pages) configuration describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested. - -Spans for small and medium blocks are cached in four levels to avoid calls to map/unmap memory pages. The first level is a per thread single active span for each size class. The second level is a per thread list of partially free spans for each size class. The third level is a per thread list of free spans. The fourth level is a global list of free spans. - -Each span for a small and medium size class keeps track of how many blocks are allocated/free, as well as a list of which blocks that are free for allocation. To avoid locks, each span is completely owned by the allocating thread, and all cross-thread deallocations will be deferred to the owner thread through a separate free list per span. - -Large blocks, or super spans, are cached in two levels. 
The first level is a per thread list of free super spans. The second level is a global list of free super spans. - -# Memory mapping -By default the allocator uses OS APIs to map virtual memory pages as needed, either `VirtualAlloc` on Windows or `mmap` on POSIX systems. If you want to use your own custom memory mapping provider you can use __rpmalloc_initialize_config__ and pass function pointers to map and unmap virtual memory. These function should reserve and free the requested number of bytes. - -The returned memory address from the memory map function MUST be aligned to the memory page size and the memory span size (which ever is larger), both of which is configurable. Either provide the page and span sizes during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment which is equal to the maximum of page and span size. The span size MUST be a power of two in [4096, 262144] range, and be a multiple or divisor of the memory page size. - -Memory mapping requests are always done in multiples of the memory page size. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two. - -To reduce system call overhead, memory spans are mapped in batches controlled by the `span_map_count` configuration variable (which defaults to the `DEFAULT_SPAN_MAP_COUNT` value if 0, which in turn is sized according to the cache configuration define, defaulting to 64). If the memory page size is larger than the span size, the number of spans to map in a single call will be adjusted to guarantee a multiple of the page size, and the spans will be kept mapped until the entire span range can be unmapped in one call (to avoid trying to unmap partial pages). - -On macOS and iOS mmap requests are tagged with tag 240 for easy identification with the vmmap tool. - -# Span breaking -Super spans (spans a multiple > 1 of the span size) can be subdivided into smaller spans to fulfill a need to map a new span of memory. By default the allocator will greedily grab and break any larger span from the available caches before mapping new virtual memory. However, spans can currently not be glued together to form larger super spans again. Subspans can traverse the cache and be used by different threads individually. - -A span that is a subspan of a larger super span can be individually decommitted to reduce physical memory pressure when the span is evicted from caches and scheduled to be unmapped. The entire original super span will keep track of the subspans it is broken up into, and when the entire range is decommitted the super span will be unmapped. This allows platforms like Windows that require the entire virtual memory range that was mapped in a call to VirtualAlloc to be unmapped in one call to VirtualFree, while still decommitting individual pages in subspans (if the page size is smaller than the span size). - -If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 for decommitting individual pages and the total super span byte size for finally releasing the entire super span memory range. 
- -# Memory fragmentation -There is no memory fragmentation by the allocator in the sense that it will not leave unallocated and unusable "holes" in the memory pages by calls to allocate and free blocks of different sizes. This is due to the fact that the memory pages allocated for each size class is split up in perfectly aligned blocks which are not reused for a request of a different size. The block freed by a call to `rpfree` will always be immediately available for an allocation request within the same size class. - -However, there is memory fragmentation in the meaning that a request for x bytes followed by a request of y bytes where x and y are at least one size class different in size will return blocks that are at least one memory page apart in virtual address space. Only blocks of the same size will potentially be within the same memory page span. - -rpmalloc keeps an "active span" and free list for each size class. This leads to back-to-back allocations will most likely be served from within the same span of memory pages (unless the span runs out of free blocks). The rpmalloc implementation will also use any "holes" in memory pages in semi-filled spans before using a completely free span. - -# First class heaps -rpmalloc provides a first class heap type with explicit heap control API. Heaps are maintained with calls to __rpmalloc_heap_acquire__ and __rpmalloc_heap_release__ and allocations/frees are done with __rpmalloc_heap_alloc__ and __rpmalloc_heap_free__. See the `rpmalloc.h` documentation for the full list of functions in the heap API. The main use case of explicit heap control is to scope allocations in a heap and release everything with a single call to __rpmalloc_heap_free_all__ without having to maintain ownership of memory blocks. Note that the heap API is not thread-safe, the caller must make sure that each heap is only used in a single thread at any given time. - -# Producer-consumer scenario -Compared to the some other allocators, rpmalloc does not suffer as much from a producer-consumer thread scenario where one thread allocates memory blocks and another thread frees the blocks. In some allocators the free blocks need to traverse both the thread cache of the thread doing the free operations as well as the global cache before being reused in the allocating thread. In rpmalloc the freed blocks will be reused as soon as the allocating thread needs to get new spans from the thread cache. This enables faster release of completely freed memory pages as blocks in a memory page will not be aliased between different owning threads. - -# Best case scenarios -Threads that keep ownership of allocated memory blocks within the thread and free the blocks from the same thread will have optimal performance. - -Threads that have allocation patterns where the difference in memory usage high and low water marks fit within the thread cache thresholds in the allocator will never touch the global cache except during thread init/fini and have optimal performance. Tweaking the cache limits can be done on a per-size-class basis. - -# Worst case scenarios -Since each thread cache maps spans of memory pages per size class, a thread that allocates just a few blocks of each size class (16, 32, ...) for many size classes will never fill each bucket, and thus map a lot of memory pages while only using a small fraction of the mapped memory. 
However, the wasted memory will always be less than 4KiB (or the configured memory page size) per size class as each span is initialized one memory page at a time. The cache for free spans will be reused by all size classes. - -Threads that perform a lot of allocations and deallocations in a pattern that have a large difference in high and low water marks, and that difference is larger than the thread cache size, will put a lot of contention on the global cache. What will happen is the thread cache will overflow on each low water mark causing pages to be released to the global cache, then underflow on high water mark causing pages to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead). - -# Caveats -VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space. - -All entry points assume the passed values are valid, for example passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__. - -To support global scope data doing dynamic allocation/deallocation such as C++ objects with custom constructors and destructors, the call to __rpmalloc_finalize__ will not completely terminate the allocator but rather empty all caches and put the allocator in finalization mode. Once this call has been made, the allocator is no longer thread safe and expects all remaining calls to originate from global data destruction on main thread. Any spans or heaps becoming free during this phase will be immediately unmapped to allow correct teardown of the process or dynamic library without any leaks. - -# Other languages - -[Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs) - -[Stas Denisov](https://github.com/nxrighthere) has created a C# wrapper available at [Rpmalloc-CSharp](https://github.com/nxrighthere/Rpmalloc-CSharp) - -# License - -This is free and unencumbered software released into the public domain. - -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. - -In jurisdictions that recognize copyright laws, the author or authors -of this software dedicate any and all copyright interest in the -software to the public domain. We make this dedication for the benefit -of the public at large and to the detriment of our heirs and -successors. We intend this dedication to be an overt act of -relinquishment in perpetuity of all present and future rights to this -software under copyright law. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to 
-
-
-You can also use this software under the MIT license if public domain is
-not recognized in your country
-
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Mattias Jansson
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+# rpmalloc - General Purpose Memory Allocator
+This library provides a cross platform lock free thread caching 16-byte aligned memory allocator implemented in C.
+This is a fork of rpmalloc 1.4.5.
+
+Platforms currently supported:
+
+- Windows
+- MacOS
+- iOS
+- Linux
+- Android
+- Haiku
+
+The code should be easily portable to any platform with atomic operations and an mmap-style virtual memory management API. The API used to map/unmap memory pages can be configured at runtime to a custom implementation and mapping granularity/size.
+
+This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license.
+
+# Performance
+We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~3000 lines of C code. All allocations have a natural 16-byte alignment.
+
+Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures the number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The number of threads, cross-thread deallocation rate and allocation size limits are configured by command line arguments.
+
+https://github.com/mjansson/rpmalloc-benchmark
+
+Below is an example performance comparison chart of rpmalloc and other popular allocator implementations, with default configurations used.
+
+![Ubuntu 16.10, random [16, 8000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=301017877&format=image)
+
+The benchmark producing these numbers was run on an Ubuntu 16.10 machine with 8 logical cores (4 physical, HT). The actual numbers are not to be interpreted as absolute performance figures, but rather as relative comparisons between the different allocators. For additional benchmark results, see the [BENCHMARKS](BENCHMARKS.md) file.
+
+Configuration of the thread and global caches can be important depending on your use pattern. See [CACHE](CACHE.md) for a case study and some comments/guidelines.
+
+# Required functions
+
+Before calling any other function in the API, you __MUST__ call the initialization function, either __rpmalloc_initialize__ or __rpmalloc_initialize_config__, or you will get undefined behaviour when calling other rpmalloc entry points.
+
+Before terminating your use of the allocator, you __SHOULD__ call __rpmalloc_finalize__ in order to release caches and unmap virtual memory, as well as prepare the allocator for global scope cleanup at process exit or dynamic library unload, depending on your use case.
+
+# Using
+The easiest way to use the library is simply adding __rpmalloc.[h|c]__ to your project and compiling them along with your sources. This contains only the rpmalloc specific entry points and does not provide internal hooks into process and/or thread creation at the moment. You are required to call these functions from your own code in order to initialize and finalize the allocator in your process and threads:
+
+__rpmalloc_initialize__ : Call at process start to initialize the allocator
+
+__rpmalloc_initialize_config__ : Optional entry point to call at process start to initialize the allocator with a custom memory mapping backend, memory page size and mapping granularity.
+
+__rpmalloc_finalize__: Call at process exit to finalize the allocator
+
+__rpmalloc_thread_initialize__: Call at each thread start to initialize the thread local data for the allocator
+
+__rpmalloc_thread_finalize__: Call at each thread exit to finalize and release the thread cache back to the global cache
+
+__rpmalloc_config__: Get the current runtime configuration of the allocator
+
+Then simply use __rpmalloc__/__rpfree__ and the other malloc style replacement functions. Remember all allocations are 16-byte aligned, so there is no need to call the explicit rpmemalign/rpaligned_alloc/rpposix_memalign functions unless you need greater alignment; they are simply wrappers to make it easier to replace in existing code.
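+
+A minimal single-threaded sketch of this flow (illustrative only; it relies solely on the entry points listed above, and the `main` function here is hypothetical):
+
+```c
+#include "rpmalloc.h"
+
+int main(void) {
+  // Must be called before any other rpmalloc entry point.
+  rpmalloc_initialize();
+
+  void *block = rpmalloc(100); // fitted to a size class, 16-byte aligned
+  rpfree(block);
+
+  // Release caches and unmap virtual memory before exit.
+  rpmalloc_finalize();
+  return 0;
+}
+```
+
+Worker threads would additionally bracket their lifetime with __rpmalloc_thread_initialize__/__rpmalloc_thread_finalize__ as described above.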
+
+If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero, which will include the `malloc.c` file in compilation of __rpmalloc.c__, and then rebuild the library or your project where you added the rpmalloc source. If you compile rpmalloc as a separate library you must make the linker use the override symbols from the library by referencing at least one symbol. The easiest way is to simply include `rpmalloc.h` in at least one source file and call `rpmalloc_linker_reference` somewhere - it's a dummy empty function. On Windows platforms and C++ overrides you have to `#include <rpnew.h>` in at least one source file and also manually handle the initialize/finalize of the process and all threads. The list of libc entry points replaced may not be complete; use libc/stdc++ replacement only as a convenience for testing the library on an existing code base, not a final solution.
+
+For explicit first class heaps, see the __rpmalloc_heap_*__ API under the [first class heaps](#first-class-heaps) section, requiring __RPMALLOC_FIRST_CLASS_HEAPS__ to be defined to 1.
+
+# Building
+To compile as a static library, run the configure python script, which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides.
+
+The configure + ninja build also produces two shared object/dynamic libraries. The `rpmallocwrap` shared library can be used with LD_PRELOAD/DYLD_INSERT_LIBRARIES to inject into a preexisting binary, replacing any malloc/free family of function calls. This is only implemented for Linux and macOS targets. The list of libc entry points replaced may not be complete; use preloading as a convenience for testing the library on an existing binary, not a final solution. The dynamic library also provides automatic init/fini of process and threads for all platforms.
+
+The latest stable release is available in the master branch. For latest development code, use the develop branch.
+
+# Cache configuration options
+Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by editing the individual defines in the `rpmalloc.c` source file for fine tuned control.
+
+__ENABLE_UNLIMITED_CACHE__: By default defined to 0, set to 1 to make all caches infinite, i.e. never release spans to the global cache unless the thread finishes, and never unmap memory pages back to the OS. Highest performance but largest memory overhead.
+
+__ENABLE_UNLIMITED_GLOBAL_CACHE__: By default defined to 0, set to 1 to make global caches infinite, i.e. never unmap memory pages back to the OS.
+
+__ENABLE_UNLIMITED_THREAD_CACHE__: By default defined to 0, set to 1 to make thread caches infinite, i.e. never release spans to the global cache unless the thread finishes.
+
+__ENABLE_GLOBAL_CACHE__: By default defined to 1, enables the global cache shared between all threads. Set to 0 to disable the global cache and directly unmap pages evicted from the thread cache.
+
+__ENABLE_THREAD_CACHE__: By default defined to 1, enables the per-thread cache. Set to 0 to disable the thread cache and directly unmap pages no longer in use (also disables the global cache).
+
+__ENABLE_ADAPTIVE_THREAD_CACHE__: Introduces a simple heuristic in the thread cache size, keeping 25% of the high water mark for each span count class.
+
+# Other configuration options
+Detailed statistics are available if __ENABLE_STATISTICS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. This will cause a slight overhead at runtime to collect statistics for each memory operation, and will also add 4 bytes overhead per allocation to track sizes.
+
+Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`.
If enabled, size arguments to the global entry points are verified not to cause integer overflows in calculations.
+
+Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`.
+
+To include __malloc.c__ in compilation and provide overrides of the standard library malloc entry points, define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization and finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1.
+
+To enable the runtime configurable memory page and span sizes, define __RPMALLOC_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB.
+
+To enable support for first class heaps, define __RPMALLOC_FIRST_CLASS_HEAPS__ to 1. By default, the first class heap API is disabled.
+
+# Huge pages
+The allocator has support for huge/large pages on Windows, Linux and MacOS. To enable it, pass a non-zero value in the config value `enable_huge_pages` when initializing the allocator with `rpmalloc_initialize_config`. If the system does not support huge pages it will be automatically disabled. You can query the status by looking at `enable_huge_pages` in the config returned from a call to `rpmalloc_config` after initialization is done.
+
+# Quick overview
+The allocator is similar in spirit to tcmalloc from the [Google Performance Toolkit](https://github.com/gperftools/gperftools). It uses separate heaps for each thread and partitions memory blocks according to a preconfigured set of size classes, up to 2MiB. Larger blocks are mapped and unmapped directly. Allocations for different size classes will be served from different sets of memory pages; each "span" of pages is dedicated to one size class. Spans of pages can flow between threads when the thread cache overflows and are released to a global cache, or when the thread ends. Unlike tcmalloc, single blocks do not flow between threads, only entire spans of pages.
+
+# Implementation details
+The allocator is based on a fixed but configurable page alignment (defaults to 64KiB) and 16 byte block alignment, where all runs of memory pages (spans) are mapped to this alignment boundary. On Windows this is automatically guaranteed up to 64KiB by the VirtualAlloc granularity, and on mmap systems it is achieved by oversizing the mapping and aligning the returned virtual memory address to the required boundaries. By aligning to a fixed size the free operation can locate the header of the memory span without having to do a table lookup (as tcmalloc does) by simply masking out the low bits of the address (for 64KiB this would be the low 16 bits).
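+
+The span lookup described above amounts to a single mask of the block address. A sketch (illustrative only, not the library's actual code; `SPAN_SIZE` stands in for the configured span size/alignment):
+
+```c
+#include <stdint.h>
+
+#define SPAN_SIZE ((uintptr_t)(64 * 1024)) /* default 64KiB span alignment */
+
+/* Locate the span header for a block by clearing the low bits of its
+   address; for a 64KiB span size these are the low 16 bits. */
+static void *span_of_block(void *block) {
+  return (void *)((uintptr_t)block & ~(SPAN_SIZE - 1));
+}
+```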
Each small and medium size class has an associated span configuration (a span meaning a contiguous set of memory pages) describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested.
+
+Spans for small and medium blocks are cached in four levels to avoid calls to map/unmap memory pages. The first level is a per thread single active span for each size class. The second level is a per thread list of partially free spans for each size class. The third level is a per thread list of free spans. The fourth level is a global list of free spans.
+
+Each span for a small and medium size class keeps track of how many blocks are allocated/free, as well as a list of which blocks are free for allocation. To avoid locks, each span is completely owned by the allocating thread, and all cross-thread deallocations will be deferred to the owner thread through a separate free list per span.
+
+Large blocks, or super spans, are cached in two levels. The first level is a per thread list of free super spans. The second level is a global list of free super spans.
+
+# Memory mapping
+By default the allocator uses OS APIs to map virtual memory pages as needed, either `VirtualAlloc` on Windows or `mmap` on POSIX systems. If you want to use your own custom memory mapping provider you can use __rpmalloc_initialize_config__ and pass function pointers to map and unmap virtual memory (a sketch of a custom mapper is shown at the end of this section). These functions should reserve and free the requested number of bytes.
+
+The returned memory address from the memory map function MUST be aligned to the memory page size and the memory span size (whichever is larger), both of which are configurable. Either provide the page and span sizes during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment, which is equal to the maximum of the page and span sizes. The span size MUST be a power of two in the [4096, 262144] range, and be a multiple or divisor of the memory page size.
+
+Memory mapping requests are always done in multiples of the memory page size. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two.
+
+To reduce system call overhead, memory spans are mapped in batches controlled by the `span_map_count` configuration variable (which defaults to the `DEFAULT_SPAN_MAP_COUNT` value if 0, which in turn is sized according to the cache configuration define, defaulting to 64). If the memory page size is larger than the span size, the number of spans to map in a single call will be adjusted to guarantee a multiple of the page size, and the spans will be kept mapped until the entire span range can be unmapped in one call (to avoid trying to unmap partial pages).
+
+On macOS and iOS mmap requests are tagged with tag 240 for easy identification with the vmmap tool.
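+
+The following is a sketch of a custom POSIX mapper under stated assumptions: the `memory_map(size_t size, size_t *offset)` and `memory_unmap(void *address, size_t size, size_t offset, size_t release)` function pointer signatures are taken from the bundled `rpmalloc.h`, the `offset` out-parameter carries the alignment padding back to the unmap call, and `madvise` is just one possible decommit policy.
+
+```c
+#include <stdint.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "rpmalloc.h"
+
+#define MY_SPAN_SIZE (64 * 1024) /* must match the configured span size */
+
+static void *my_map(size_t size, size_t *offset) {
+  // Oversize the mapping so the returned address can be aligned to the
+  // span size, per the alignment contract above. The padding is reported
+  // through *offset and handed back to my_unmap on final release.
+  void *ptr = mmap(0, size + MY_SPAN_SIZE, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  size_t padding;
+  if (ptr == MAP_FAILED)
+    return 0;
+  padding = (uintptr_t)ptr & (MY_SPAN_SIZE - 1);
+  if (padding)
+    padding = MY_SPAN_SIZE - padding;
+  *offset = padding;
+  return (char *)ptr + padding;
+}
+
+static void my_unmap(void *address, size_t size, size_t offset, size_t release) {
+  if (release) {
+    // Final release of the entire range: undo the alignment padding and
+    // return the whole original mapping to the system.
+    munmap((char *)address - offset, release + MY_SPAN_SIZE);
+  } else {
+    // Partial decommit of a subrange that stays reserved.
+    madvise(address, size, MADV_DONTNEED);
+  }
+}
+
+int main(void) {
+  rpmalloc_config_t config;
+  memset(&config, 0, sizeof(config));
+  config.memory_map = my_map;
+  config.memory_unmap = my_unmap;
+  config.span_size = MY_SPAN_SIZE; // non-default sizes need RPMALLOC_CONFIGURABLE
+  rpmalloc_initialize_config(&config);
+  void *p = rpmalloc(100);
+  rpfree(p);
+  rpmalloc_finalize();
+  return 0;
+}
+```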
+
+# Span breaking
+Super spans (spans that are a multiple > 1 of the span size) can be subdivided into smaller spans to fulfill a need to map a new span of memory. By default the allocator will greedily grab and break any larger span from the available caches before mapping new virtual memory. However, spans currently cannot be glued together to form larger super spans again. Subspans can traverse the cache and be used by different threads individually.
+
+A span that is a subspan of a larger super span can be individually decommitted to reduce physical memory pressure when the span is evicted from caches and scheduled to be unmapped. The original super span will keep track of the subspans it is broken up into, and when the entire range is decommitted the super span will be unmapped. This accommodates platforms like Windows, which require the entire virtual memory range that was mapped in a call to VirtualAlloc to be unmapped in one call to VirtualFree, while still allowing individual pages in subspans to be decommitted (if the page size is smaller than the span size).
+
+If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 when decommitting individual pages, and to the total super span byte size when finally releasing the entire super span memory range.
+
+# Memory fragmentation
+There is no memory fragmentation by the allocator in the sense that it will not leave unallocated and unusable "holes" in the memory pages by calls to allocate and free blocks of different sizes. This is due to the fact that the memory pages allocated for each size class are split up into perfectly aligned blocks, which are not reused for a request of a different size. The block freed by a call to `rpfree` will always be immediately available for an allocation request within the same size class.
+
+However, there is memory fragmentation in the sense that a request for x bytes followed by a request for y bytes, where x and y differ by at least one size class, will return blocks that are at least one memory page apart in virtual address space. Only blocks of the same size will potentially be within the same memory page span.
+
+rpmalloc keeps an "active span" and free list for each size class. This means back-to-back allocations will most likely be served from within the same span of memory pages (unless the span runs out of free blocks). The rpmalloc implementation will also use any "holes" in memory pages in semi-filled spans before using a completely free span.
+
+# First class heaps
+rpmalloc provides a first class heap type with an explicit heap control API. Heaps are maintained with calls to __rpmalloc_heap_acquire__ and __rpmalloc_heap_release__, and allocations/frees are done with __rpmalloc_heap_alloc__ and __rpmalloc_heap_free__. See the `rpmalloc.h` documentation for the full list of functions in the heap API. The main use case of explicit heap control is to scope allocations in a heap and release everything with a single call to __rpmalloc_heap_free_all__ without having to maintain ownership of memory blocks. Note that the heap API is not thread-safe; the caller must make sure that each heap is only used by a single thread at any given time. A short sketch of the pattern follows.
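+
+A minimal sketch of the scoped allocation pattern, assuming a build with __RPMALLOC_FIRST_CLASS_HEAPS__ defined to 1 and the `rpmalloc_heap_t` type declared in `rpmalloc.h`:
+
+```c
+#include "rpmalloc.h"
+
+// All blocks for one unit of work come from a dedicated heap, so teardown
+// is a single call instead of per-block ownership tracking.
+static void process_job(void) {
+  rpmalloc_heap_t *heap = rpmalloc_heap_acquire();
+
+  void *scratch = rpmalloc_heap_alloc(heap, 16 * 1024);
+  void *result = rpmalloc_heap_alloc(heap, 256);
+  (void)scratch;
+  (void)result;
+
+  rpmalloc_heap_free_all(heap); // releases every block made from this heap
+  rpmalloc_heap_release(heap);  // returns the heap itself to the allocator
+}
+```
+
+Since the heap API is not thread-safe, a heap acquired like this should stay confined to the thread that acquired it.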
+
+# Producer-consumer scenario
+Compared to some other allocators, rpmalloc does not suffer as much from a producer-consumer thread scenario, where one thread allocates memory blocks and another thread frees them. In some allocators, the freed blocks need to traverse both the thread cache of the thread doing the free operations and the global cache before being reused by the allocating thread. In rpmalloc, the freed blocks will be reused as soon as the allocating thread needs to get new spans from the thread cache. This enables faster release of completely freed memory pages, as blocks in a memory page will not be aliased between different owning threads.
+
+# Best case scenarios
+Threads that keep ownership of allocated memory blocks within the thread and free the blocks from the same thread will have optimal performance.
+
+Threads that have allocation patterns where the difference between the memory usage high and low water marks fits within the thread cache thresholds in the allocator will never touch the global cache except during thread init/fini, and will have optimal performance. Tweaking the cache limits can be done on a per-size-class basis.
+
+# Worst case scenarios
+Since each thread cache maps spans of memory pages per size class, a thread that allocates just a few blocks of each size class (16, 32, ...) for many size classes will never fill each bucket, and will thus map a lot of memory pages while only using a small fraction of the mapped memory. However, the wasted memory will always be less than 4KiB (or the configured memory page size) per size class, as each span is initialized one memory page at a time. The cache for free spans will be reused by all size classes.
+
+Threads that perform a lot of allocations and deallocations in a pattern that has a large difference between high and low water marks, where that difference is larger than the thread cache size, will put a lot of contention on the global cache. The thread cache will overflow on each low water mark, causing pages to be released to the global cache, then underflow on the high water mark, causing pages to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead).
+
+# Caveats
+VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with the configured span size to always be able to return a memory area with the required alignment. Since the extra memory pages are never touched, this will not result in extra committed physical memory pages, but only increases virtual memory address space usage.
+
+All entry points assume the passed values are valid; for example, passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__
+
+To support global scope data doing dynamic allocation/deallocation, such as C++ objects with custom constructors and destructors, the call to __rpmalloc_finalize__ will not completely terminate the allocator but rather empty all caches and put the allocator in finalization mode. Once this call has been made, the allocator is no longer thread safe and expects all remaining calls to originate from global data destruction on the main thread. Any spans or heaps becoming free during this phase will be immediately unmapped to allow correct teardown of the process or dynamic library without any leaks.
+
+# Other languages
+
+[Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs)
+
+[Stas Denisov](https://github.com/nxrighthere) has created a C# wrapper available at [Rpmalloc-CSharp](https://github.com/nxrighthere/Rpmalloc-CSharp)
+
+# License
+
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+ +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to + + +You can also use this software under the MIT license if public domain is +not recognized in your country + + +The MIT License (MIT) + +Copyright (c) 2017 Mattias Jansson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/llvm/lib/Support/rpmalloc/malloc.c b/llvm/lib/Support/rpmalloc/malloc.c index 59e13aab3ef7..3fcfe848250c 100644 --- a/llvm/lib/Support/rpmalloc/malloc.c +++ b/llvm/lib/Support/rpmalloc/malloc.c @@ -1,724 +1,724 @@ -//===------------------------ malloc.c ------------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. 
-// -// -// This file provides overrides for the standard library malloc entry points for -// C and new/delete operators for C++ It also provides automatic -// initialization/finalization of process and threads -// -//===----------------------------------------------------------------------===// - -#if defined(__TINYC__) -#include -#endif - -#ifndef ARCH_64BIT -#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) -#define ARCH_64BIT 1 -_Static_assert(sizeof(size_t) == 8, "Data type size mismatch"); -_Static_assert(sizeof(void *) == 8, "Data type size mismatch"); -#else -#define ARCH_64BIT 0 -_Static_assert(sizeof(size_t) == 4, "Data type size mismatch"); -_Static_assert(sizeof(void *) == 4, "Data type size mismatch"); -#endif -#endif - -#if (defined(__GNUC__) || defined(__clang__)) -#pragma GCC visibility push(default) -#endif - -#define USE_IMPLEMENT 1 -#define USE_INTERPOSE 0 -#define USE_ALIAS 0 - -#if defined(__APPLE__) -#undef USE_INTERPOSE -#define USE_INTERPOSE 1 - -typedef struct interpose_t { - void *new_func; - void *orig_func; -} interpose_t; - -#define MAC_INTERPOSE_PAIR(newf, oldf) {(void *)newf, (void *)oldf} -#define MAC_INTERPOSE_SINGLE(newf, oldf) \ - __attribute__((used)) static const interpose_t macinterpose##newf##oldf \ - __attribute__((section("__DATA, __interpose"))) = \ - MAC_INTERPOSE_PAIR(newf, oldf) - -#endif - -#if !defined(_WIN32) && !defined(__APPLE__) -#undef USE_IMPLEMENT -#undef USE_ALIAS -#define USE_IMPLEMENT 0 -#define USE_ALIAS 1 -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4100) -#undef malloc -#undef free -#undef calloc -#define RPMALLOC_RESTRICT __declspec(restrict) -#else -#define RPMALLOC_RESTRICT -#endif - -#if ENABLE_OVERRIDE - -typedef struct rp_nothrow_t { - int __dummy; -} rp_nothrow_t; - -#if USE_IMPLEMENT - -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL malloc(size_t size) { - return rpmalloc(size); -} -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL calloc(size_t count, - size_t size) { - return rpcalloc(count, size); -} -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL realloc(void *ptr, - size_t size) { - return rprealloc(ptr, size); -} -extern inline void *RPMALLOC_CDECL reallocf(void *ptr, size_t size) { - return rprealloc(ptr, size); -} -extern inline void *RPMALLOC_CDECL aligned_alloc(size_t alignment, - size_t size) { - return rpaligned_alloc(alignment, size); -} -extern inline void *RPMALLOC_CDECL memalign(size_t alignment, size_t size) { - return rpmemalign(alignment, size); -} -extern inline int RPMALLOC_CDECL posix_memalign(void **memptr, size_t alignment, - size_t size) { - return rpposix_memalign(memptr, alignment, size); -} -extern inline void RPMALLOC_CDECL free(void *ptr) { rpfree(ptr); } -extern inline void RPMALLOC_CDECL cfree(void *ptr) { rpfree(ptr); } -extern inline size_t RPMALLOC_CDECL malloc_usable_size(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern inline size_t RPMALLOC_CDECL malloc_size(void *ptr) { - return rpmalloc_usable_size(ptr); -} - -#ifdef _WIN32 -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _malloc_base(size_t size) { - return rpmalloc(size); -} -extern inline void RPMALLOC_CDECL _free_base(void *ptr) { rpfree(ptr); } -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _calloc_base(size_t count, - size_t size) { - return rpcalloc(count, size); -} -extern inline size_t RPMALLOC_CDECL _msize(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern inline size_t RPMALLOC_CDECL _msize_base(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern 
inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL -_realloc_base(void *ptr, size_t size) { - return rprealloc(ptr, size); -} -#endif - -#ifdef _WIN32 -// For Windows, #include in one source file to get the C++ operator -// overrides implemented in your module -#else -// Overload the C++ operators using the mangled names -// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) operators -// delete and delete[] -#define RPDEFVIS __attribute__((visibility("default"))) -extern void _ZdlPv(void *p); -void RPDEFVIS _ZdlPv(void *p) { rpfree(p); } -extern void _ZdaPv(void *p); -void RPDEFVIS _ZdaPv(void *p) { rpfree(p); } -#if ARCH_64BIT -// 64-bit operators new and new[], normal and aligned -extern void *_Znwm(uint64_t size); -void *RPDEFVIS _Znwm(uint64_t size) { return rpmalloc(size); } -extern void *_Znam(uint64_t size); -void *RPDEFVIS _Znam(uint64_t size) { return rpmalloc(size); } -extern void *_Znwmm(uint64_t size, uint64_t align); -void *RPDEFVIS _Znwmm(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_Znamm(uint64_t size, uint64_t align); -void *RPDEFVIS _Znamm(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwmSt11align_val_t(uint64_t size, uint64_t align); -void *RPDEFVIS _ZnwmSt11align_val_t(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnamSt11align_val_t(uint64_t size, uint64_t align); -void *RPDEFVIS _ZnamSt11align_val_t(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -extern void *_ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -// 64-bit operators sized delete and delete[], normal and aligned -extern void _ZdlPvm(void *p, uint64_t size); -void RPDEFVIS _ZdlPvm(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdaPvm(void *p, uint64_t size); -void RPDEFVIS _ZdaPvm(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdlPvSt11align_val_t(void *p, uint64_t align); -void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t align) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdaPvSt11align_val_t(void *p, uint64_t align); -void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t align) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); -void RPDEFVIS _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(align); -} -extern void _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); -void RPDEFVIS _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(align); -} -#else -// 
32-bit operators new and new[], normal and aligned -extern void *_Znwj(uint32_t size); -void *RPDEFVIS _Znwj(uint32_t size) { return rpmalloc(size); } -extern void *_Znaj(uint32_t size); -void *RPDEFVIS _Znaj(uint32_t size) { return rpmalloc(size); } -extern void *_Znwjj(uint32_t size, uint32_t align); -void *RPDEFVIS _Znwjj(uint32_t size, uint32_t align) { - return rpaligned_alloc(align, size); -} -extern void *_Znajj(uint32_t size, uint32_t align); -void *RPDEFVIS _Znajj(uint32_t size, uint32_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwjSt11align_val_t(size_t size, size_t align); -void *RPDEFVIS _ZnwjSt11align_val_t(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnajSt11align_val_t(size_t size, size_t align); -void *RPDEFVIS _ZnajSt11align_val_t(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -extern void *_ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -// 32-bit operators sized delete and delete[], normal and aligned -extern void _ZdlPvj(void *p, uint64_t size); -void RPDEFVIS _ZdlPvj(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdaPvj(void *p, uint64_t size); -void RPDEFVIS _ZdaPvj(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdlPvSt11align_val_t(void *p, uint32_t align); -void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t a) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdaPvSt11align_val_t(void *p, uint32_t align); -void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t a) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdlPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); -void RPDEFVIS _ZdlPvjSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(a); -} -extern void _ZdaPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); -void RPDEFVIS _ZdaPvjSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(a); -} -#endif -#endif -#endif - -#if USE_INTERPOSE || USE_ALIAS - -static void *rpmalloc_nothrow(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -static void *rpaligned_alloc_reverse(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -static void *rpaligned_alloc_reverse_nothrow(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -static void rpfree_size(void *p, size_t size) { - (void)sizeof(size); - rpfree(p); -} -static void rpfree_aligned(void *p, size_t align) { - (void)sizeof(align); - rpfree(p); -} -static void rpfree_size_aligned(void *p, size_t size, size_t align) { - (void)sizeof(size); - 
(void)sizeof(align); - rpfree(p); -} - -#endif - -#if USE_INTERPOSE - -__attribute__((used)) static const interpose_t macinterpose_malloc[] - __attribute__((section("__DATA, __interpose"))) = { - // new and new[] - MAC_INTERPOSE_PAIR(rpmalloc, _Znwm), - MAC_INTERPOSE_PAIR(rpmalloc, _Znam), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znwmm), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znamm), - MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnwmRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnamRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnwmSt11align_val_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnamSt11align_val_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, - _ZnwmSt11align_val_tRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, - _ZnamSt11align_val_tRKSt9nothrow_t), - // delete and delete[] - MAC_INTERPOSE_PAIR(rpfree, _ZdlPv), MAC_INTERPOSE_PAIR(rpfree, _ZdaPv), - MAC_INTERPOSE_PAIR(rpfree_size, _ZdlPvm), - MAC_INTERPOSE_PAIR(rpfree_size, _ZdaPvm), - MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdlPvSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdaPvSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdlPvmSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdaPvmSt11align_val_t), - // libc entry points - MAC_INTERPOSE_PAIR(rpmalloc, malloc), - MAC_INTERPOSE_PAIR(rpmalloc, calloc), - MAC_INTERPOSE_PAIR(rprealloc, realloc), - MAC_INTERPOSE_PAIR(rprealloc, reallocf), -#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15 - MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc), -#endif - MAC_INTERPOSE_PAIR(rpmemalign, memalign), - MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign), - MAC_INTERPOSE_PAIR(rpfree, free), MAC_INTERPOSE_PAIR(rpfree, cfree), - MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_usable_size), - MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_size)}; - -#endif - -#if USE_ALIAS - -#define RPALIAS(fn) __attribute__((alias(#fn), used, visibility("default"))); - -// Alias the C++ operators using the mangled names -// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) - -// operators delete and delete[] -void _ZdlPv(void *p) RPALIAS(rpfree) void _ZdaPv(void *p) RPALIAS(rpfree) - -#if ARCH_64BIT - // 64-bit operators new and new[], normal and aligned - void *_Znwm(uint64_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *_Znam(uint64_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwmm(uint64_t size, - uint64_t align) - RPALIAS(rpaligned_alloc_reverse) void *_Znamm(uint64_t size, - uint64_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwmSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnamSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwmRKSt9nothrow_t( - size_t size, rp_nothrow_t t) - RPALIAS(rpmalloc_nothrow) void *_ZnamRKSt9nothrow_t( - size_t size, - rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void - *_ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, - size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) void - *_ZnamSt11align_val_tRKSt9nothrow_t( - size_t size, size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) - // 64-bit operators delete and delete[], sized and aligned - void _ZdlPvm(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvm(void *p, - size_t n) - RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) - 
RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, - size_t a) - RPALIAS(rpfree_aligned) void _ZdlPvmSt11align_val_t(void *p, - size_t n, - size_t a) - RPALIAS(rpfree_size_aligned) void _ZdaPvmSt11align_val_t( - void *p, size_t n, size_t a) - RPALIAS(rpfree_size_aligned) -#else - // 32-bit operators new and new[], normal and aligned - void *_Znwj(uint32_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *_Znaj(uint32_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwjj(uint32_t size, - uint32_t align) - RPALIAS(rpaligned_alloc_reverse) void *_Znajj(uint32_t size, - uint32_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwjSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnajSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwjRKSt9nothrow_t( - size_t size, rp_nothrow_t t) - RPALIAS(rpmalloc_nothrow) void *_ZnajRKSt9nothrow_t( - size_t size, - rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void - *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, - size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) void - *_ZnajSt11align_val_tRKSt9nothrow_t( - size_t size, size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) - // 32-bit operators delete and delete[], sized and aligned - void _ZdlPvj(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvj(void *p, - size_t n) - RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) - RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, - size_t a) - RPALIAS(rpfree_aligned) void _ZdlPvjSt11align_val_t(void *p, - size_t n, - size_t a) - RPALIAS(rpfree_size_aligned) void _ZdaPvjSt11align_val_t( - void *p, size_t n, size_t a) - RPALIAS(rpfree_size_aligned) -#endif - - void *malloc(size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *calloc(size_t count, size_t size) - RPALIAS(rpcalloc) void *realloc(void *ptr, size_t size) - RPALIAS(rprealloc) void *reallocf(void *ptr, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rprealloc) void *aligned_alloc(size_t alignment, size_t size) - RPALIAS(rpaligned_alloc) void *memalign( - size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rpmemalign) int posix_memalign(void **memptr, size_t alignment, - size_t size) - RPALIAS(rpposix_memalign) void free(void *ptr) - RPALIAS(rpfree) void cfree(void *ptr) RPALIAS(rpfree) -#if defined(__ANDROID__) || defined(__FreeBSD__) - size_t - malloc_usable_size(const void *ptr) RPALIAS(rpmalloc_usable_size) -#else - size_t - malloc_usable_size(void *ptr) RPALIAS(rpmalloc_usable_size) -#endif - size_t malloc_size(void *ptr) RPALIAS(rpmalloc_usable_size) - -#endif - - static inline size_t _rpmalloc_page_size(void) { - return _memory_page_size; -} - -extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size); - -extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#ifdef _MSC_VER - int err = SizeTMult(count, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(count, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = count * size; -#endif - return realloc(ptr, total); -} - -extern inline void *RPMALLOC_CDECL 
valloc(size_t size) { - get_thread_heap(); - return rpaligned_alloc(_rpmalloc_page_size(), size); -} - -extern inline void *RPMALLOC_CDECL pvalloc(size_t size) { - get_thread_heap(); - const size_t page_size = _rpmalloc_page_size(); - const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size; -#if ENABLE_VALIDATE_ARGS - if (aligned_size < size) { - errno = EINVAL; - return 0; - } -#endif - return rpaligned_alloc(_rpmalloc_page_size(), aligned_size); -} - -#endif // ENABLE_OVERRIDE - -#if ENABLE_PRELOAD - -#ifdef _WIN32 - -#if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK - -extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, - DWORD reason, LPVOID reserved); - -extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, - DWORD reason, - LPVOID reserved) { - (void)sizeof(reserved); - (void)sizeof(instance); - if (reason == DLL_PROCESS_ATTACH) - rpmalloc_initialize(); - else if (reason == DLL_PROCESS_DETACH) - rpmalloc_finalize(); - else if (reason == DLL_THREAD_ATTACH) - rpmalloc_thread_initialize(); - else if (reason == DLL_THREAD_DETACH) - rpmalloc_thread_finalize(1); - return TRUE; -} - -// end BUILD_DYNAMIC_LINK -#else - -extern void _global_rpmalloc_init(void) { - rpmalloc_set_main_thread(); - rpmalloc_initialize(); -} - -#if defined(__clang__) || defined(__GNUC__) - -static void __attribute__((constructor)) initializer(void) { - _global_rpmalloc_init(); -} - -#elif defined(_MSC_VER) - -static int _global_rpmalloc_xib(void) { - _global_rpmalloc_init(); - return 0; -} - -#pragma section(".CRT$XIB", read) -__declspec(allocate(".CRT$XIB")) void (*_rpmalloc_module_init)(void) = - _global_rpmalloc_xib; -#if defined(_M_IX86) || defined(__i386__) -#pragma comment(linker, "/include:" \ - "__rpmalloc_module_init") -#else -#pragma comment(linker, "/include:" \ - "_rpmalloc_module_init") -#endif - -#endif - -// end !BUILD_DYNAMIC_LINK -#endif - -#else - -#include -#include -#include -#include - -extern void rpmalloc_set_main_thread(void); - -static pthread_key_t destructor_key; - -static void thread_destructor(void *); - -static void __attribute__((constructor)) initializer(void) { - rpmalloc_set_main_thread(); - rpmalloc_initialize(); - pthread_key_create(&destructor_key, thread_destructor); -} - -static void __attribute__((destructor)) finalizer(void) { rpmalloc_finalize(); } - -typedef struct { - void *(*real_start)(void *); - void *real_arg; -} thread_starter_arg; - -static void *thread_starter(void *argptr) { - thread_starter_arg *arg = argptr; - void *(*real_start)(void *) = arg->real_start; - void *real_arg = arg->real_arg; - rpmalloc_thread_initialize(); - rpfree(argptr); - pthread_setspecific(destructor_key, (void *)1); - return (*real_start)(real_arg); -} - -static void thread_destructor(void *value) { - (void)sizeof(value); - rpmalloc_thread_finalize(1); -} - -#ifdef __APPLE__ - -static int pthread_create_proxy(pthread_t *thread, const pthread_attr_t *attr, - void *(*start_routine)(void *), void *arg) { - rpmalloc_initialize(); - thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); - starter_arg->real_start = start_routine; - starter_arg->real_arg = arg; - return pthread_create(thread, attr, thread_starter, starter_arg); -} - -MAC_INTERPOSE_SINGLE(pthread_create_proxy, pthread_create); - -#else - -#include - -int pthread_create(pthread_t *thread, const pthread_attr_t *attr, - void *(*start_routine)(void *), void *arg) { -#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ - defined(__NetBSD__) 
|| defined(__DragonFly__) || defined(__APPLE__) || \ - defined(__HAIKU__) - char fname[] = "pthread_create"; -#else - char fname[] = "_pthread_create"; -#endif - void *real_pthread_create = dlsym(RTLD_NEXT, fname); - rpmalloc_thread_initialize(); - thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); - starter_arg->real_start = start_routine; - starter_arg->real_arg = arg; - return (*(int (*)(pthread_t *, const pthread_attr_t *, void *(*)(void *), - void *))real_pthread_create)(thread, attr, thread_starter, - starter_arg); -} - -#endif - -#endif - -#endif - -#if ENABLE_OVERRIDE - -#if defined(__GLIBC__) && defined(__linux__) - -void *__libc_malloc(size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *__libc_calloc(size_t count, size_t size) - RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2) - RPALIAS(rpcalloc) void *__libc_realloc(void *p, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rprealloc) void __libc_free(void *p) - RPALIAS(rpfree) void __libc_cfree(void *p) - RPALIAS(rpfree) void *__libc_memalign(size_t align, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rpmemalign) int __posix_memalign(void **p, size_t align, - size_t size) - RPALIAS(rpposix_memalign) - - extern void *__libc_valloc(size_t size); -extern void *__libc_pvalloc(size_t size); - -void *__libc_valloc(size_t size) { return valloc(size); } - -void *__libc_pvalloc(size_t size) { return pvalloc(size); } - -#endif - -#endif - -#if (defined(__GNUC__) || defined(__clang__)) -#pragma GCC visibility pop -#endif +//===------------------------ malloc.c ------------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. 
+// +// +// This file provides overrides for the standard library malloc entry points for +// C and new/delete operators for C++ It also provides automatic +// initialization/finalization of process and threads +// +//===----------------------------------------------------------------------===// + +#if defined(__TINYC__) +#include +#endif + +#ifndef ARCH_64BIT +#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) +#define ARCH_64BIT 1 +_Static_assert(sizeof(size_t) == 8, "Data type size mismatch"); +_Static_assert(sizeof(void *) == 8, "Data type size mismatch"); +#else +#define ARCH_64BIT 0 +_Static_assert(sizeof(size_t) == 4, "Data type size mismatch"); +_Static_assert(sizeof(void *) == 4, "Data type size mismatch"); +#endif +#endif + +#if (defined(__GNUC__) || defined(__clang__)) +#pragma GCC visibility push(default) +#endif + +#define USE_IMPLEMENT 1 +#define USE_INTERPOSE 0 +#define USE_ALIAS 0 + +#if defined(__APPLE__) +#undef USE_INTERPOSE +#define USE_INTERPOSE 1 + +typedef struct interpose_t { + void *new_func; + void *orig_func; +} interpose_t; + +#define MAC_INTERPOSE_PAIR(newf, oldf) {(void *)newf, (void *)oldf} +#define MAC_INTERPOSE_SINGLE(newf, oldf) \ + __attribute__((used)) static const interpose_t macinterpose##newf##oldf \ + __attribute__((section("__DATA, __interpose"))) = \ + MAC_INTERPOSE_PAIR(newf, oldf) + +#endif + +#if !defined(_WIN32) && !defined(__APPLE__) +#undef USE_IMPLEMENT +#undef USE_ALIAS +#define USE_IMPLEMENT 0 +#define USE_ALIAS 1 +#endif + +#ifdef _MSC_VER +#pragma warning(disable : 4100) +#undef malloc +#undef free +#undef calloc +#define RPMALLOC_RESTRICT __declspec(restrict) +#else +#define RPMALLOC_RESTRICT +#endif + +#if ENABLE_OVERRIDE + +typedef struct rp_nothrow_t { + int __dummy; +} rp_nothrow_t; + +#if USE_IMPLEMENT + +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL malloc(size_t size) { + return rpmalloc(size); +} +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL calloc(size_t count, + size_t size) { + return rpcalloc(count, size); +} +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL realloc(void *ptr, + size_t size) { + return rprealloc(ptr, size); +} +extern inline void *RPMALLOC_CDECL reallocf(void *ptr, size_t size) { + return rprealloc(ptr, size); +} +extern inline void *RPMALLOC_CDECL aligned_alloc(size_t alignment, + size_t size) { + return rpaligned_alloc(alignment, size); +} +extern inline void *RPMALLOC_CDECL memalign(size_t alignment, size_t size) { + return rpmemalign(alignment, size); +} +extern inline int RPMALLOC_CDECL posix_memalign(void **memptr, size_t alignment, + size_t size) { + return rpposix_memalign(memptr, alignment, size); +} +extern inline void RPMALLOC_CDECL free(void *ptr) { rpfree(ptr); } +extern inline void RPMALLOC_CDECL cfree(void *ptr) { rpfree(ptr); } +extern inline size_t RPMALLOC_CDECL malloc_usable_size(void *ptr) { + return rpmalloc_usable_size(ptr); +} +extern inline size_t RPMALLOC_CDECL malloc_size(void *ptr) { + return rpmalloc_usable_size(ptr); +} + +#ifdef _WIN32 +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _malloc_base(size_t size) { + return rpmalloc(size); +} +extern inline void RPMALLOC_CDECL _free_base(void *ptr) { rpfree(ptr); } +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _calloc_base(size_t count, + size_t size) { + return rpcalloc(count, size); +} +extern inline size_t RPMALLOC_CDECL _msize(void *ptr) { + return rpmalloc_usable_size(ptr); +} +extern inline size_t RPMALLOC_CDECL _msize_base(void *ptr) { + return rpmalloc_usable_size(ptr); +} +extern 
inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL +_realloc_base(void *ptr, size_t size) { + return rprealloc(ptr, size); +} +#endif + +#ifdef _WIN32 +// For Windows, #include in one source file to get the C++ operator +// overrides implemented in your module +#else +// Overload the C++ operators using the mangled names +// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) operators +// delete and delete[] +#define RPDEFVIS __attribute__((visibility("default"))) +extern void _ZdlPv(void *p); +void RPDEFVIS _ZdlPv(void *p) { rpfree(p); } +extern void _ZdaPv(void *p); +void RPDEFVIS _ZdaPv(void *p) { rpfree(p); } +#if ARCH_64BIT +// 64-bit operators new and new[], normal and aligned +extern void *_Znwm(uint64_t size); +void *RPDEFVIS _Znwm(uint64_t size) { return rpmalloc(size); } +extern void *_Znam(uint64_t size); +void *RPDEFVIS _Znam(uint64_t size) { return rpmalloc(size); } +extern void *_Znwmm(uint64_t size, uint64_t align); +void *RPDEFVIS _Znwmm(uint64_t size, uint64_t align) { + return rpaligned_alloc(align, size); +} +extern void *_Znamm(uint64_t size, uint64_t align); +void *RPDEFVIS _Znamm(uint64_t size, uint64_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnwmSt11align_val_t(uint64_t size, uint64_t align); +void *RPDEFVIS _ZnwmSt11align_val_t(uint64_t size, uint64_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnamSt11align_val_t(uint64_t size, uint64_t align); +void *RPDEFVIS _ZnamSt11align_val_t(uint64_t size, uint64_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); +void *RPDEFVIS _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +extern void *_ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); +void *RPDEFVIS _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +extern void *_ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, + rp_nothrow_t t); +void *RPDEFVIS _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +extern void *_ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, + rp_nothrow_t t); +void *RPDEFVIS _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +// 64-bit operators sized delete and delete[], normal and aligned +extern void _ZdlPvm(void *p, uint64_t size); +void RPDEFVIS _ZdlPvm(void *p, uint64_t size) { + rpfree(p); + (void)sizeof(size); +} +extern void _ZdaPvm(void *p, uint64_t size); +void RPDEFVIS _ZdaPvm(void *p, uint64_t size) { + rpfree(p); + (void)sizeof(size); +} +extern void _ZdlPvSt11align_val_t(void *p, uint64_t align); +void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t align) { + rpfree(p); + (void)sizeof(align); +} +extern void _ZdaPvSt11align_val_t(void *p, uint64_t align); +void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t align) { + rpfree(p); + (void)sizeof(align); +} +extern void _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); +void RPDEFVIS _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { + rpfree(p); + (void)sizeof(size); + (void)sizeof(align); +} +extern void _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); +void RPDEFVIS _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { + rpfree(p); + (void)sizeof(size); + (void)sizeof(align); +} +#else +// 
32-bit operators new and new[], normal and aligned +extern void *_Znwj(uint32_t size); +void *RPDEFVIS _Znwj(uint32_t size) { return rpmalloc(size); } +extern void *_Znaj(uint32_t size); +void *RPDEFVIS _Znaj(uint32_t size) { return rpmalloc(size); } +extern void *_Znwjj(uint32_t size, uint32_t align); +void *RPDEFVIS _Znwjj(uint32_t size, uint32_t align) { + return rpaligned_alloc(align, size); +} +extern void *_Znajj(uint32_t size, uint32_t align); +void *RPDEFVIS _Znajj(uint32_t size, uint32_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnwjSt11align_val_t(size_t size, size_t align); +void *RPDEFVIS _ZnwjSt11align_val_t(size_t size, size_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnajSt11align_val_t(size_t size, size_t align); +void *RPDEFVIS _ZnajSt11align_val_t(size_t size, size_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); +void *RPDEFVIS _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +extern void *_ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); +void *RPDEFVIS _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +extern void *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, + rp_nothrow_t t); +void *RPDEFVIS _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +extern void *_ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, + rp_nothrow_t t); +void *RPDEFVIS _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +// 32-bit operators sized delete and delete[], normal and aligned +extern void _ZdlPvj(void *p, uint64_t size); +void RPDEFVIS _ZdlPvj(void *p, uint64_t size) { + rpfree(p); + (void)sizeof(size); +} +extern void _ZdaPvj(void *p, uint64_t size); +void RPDEFVIS _ZdaPvj(void *p, uint64_t size) { + rpfree(p); + (void)sizeof(size); +} +extern void _ZdlPvSt11align_val_t(void *p, uint32_t align); +void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t a) { + rpfree(p); + (void)sizeof(align); +} +extern void _ZdaPvSt11align_val_t(void *p, uint32_t align); +void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t a) { + rpfree(p); + (void)sizeof(align); +} +extern void _ZdlPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); +void RPDEFVIS _ZdlPvjSt11align_val_t(void *p, uint64_t size, uint64_t align) { + rpfree(p); + (void)sizeof(size); + (void)sizeof(a); +} +extern void _ZdaPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); +void RPDEFVIS _ZdaPvjSt11align_val_t(void *p, uint64_t size, uint64_t align) { + rpfree(p); + (void)sizeof(size); + (void)sizeof(a); +} +#endif +#endif +#endif + +#if USE_INTERPOSE || USE_ALIAS + +static void *rpmalloc_nothrow(size_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +static void *rpaligned_alloc_reverse(size_t size, size_t align) { + return rpaligned_alloc(align, size); +} +static void *rpaligned_alloc_reverse_nothrow(size_t size, size_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +static void rpfree_size(void *p, size_t size) { + (void)sizeof(size); + rpfree(p); +} +static void rpfree_aligned(void *p, size_t align) { + (void)sizeof(align); + rpfree(p); +} +static void rpfree_size_aligned(void *p, size_t size, size_t align) { + (void)sizeof(size); + 
(void)sizeof(align); + rpfree(p); +} + +#endif + +#if USE_INTERPOSE + +__attribute__((used)) static const interpose_t macinterpose_malloc[] + __attribute__((section("__DATA, __interpose"))) = { + // new and new[] + MAC_INTERPOSE_PAIR(rpmalloc, _Znwm), + MAC_INTERPOSE_PAIR(rpmalloc, _Znam), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znwmm), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znamm), + MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnwmRKSt9nothrow_t), + MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnamRKSt9nothrow_t), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnwmSt11align_val_t), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnamSt11align_val_t), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, + _ZnwmSt11align_val_tRKSt9nothrow_t), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, + _ZnamSt11align_val_tRKSt9nothrow_t), + // delete and delete[] + MAC_INTERPOSE_PAIR(rpfree, _ZdlPv), MAC_INTERPOSE_PAIR(rpfree, _ZdaPv), + MAC_INTERPOSE_PAIR(rpfree_size, _ZdlPvm), + MAC_INTERPOSE_PAIR(rpfree_size, _ZdaPvm), + MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdlPvSt11align_val_t), + MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdaPvSt11align_val_t), + MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdlPvmSt11align_val_t), + MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdaPvmSt11align_val_t), + // libc entry points + MAC_INTERPOSE_PAIR(rpmalloc, malloc), + MAC_INTERPOSE_PAIR(rpmalloc, calloc), + MAC_INTERPOSE_PAIR(rprealloc, realloc), + MAC_INTERPOSE_PAIR(rprealloc, reallocf), +#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15 + MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc), +#endif + MAC_INTERPOSE_PAIR(rpmemalign, memalign), + MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign), + MAC_INTERPOSE_PAIR(rpfree, free), MAC_INTERPOSE_PAIR(rpfree, cfree), + MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_usable_size), + MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_size)}; + +#endif + +#if USE_ALIAS + +#define RPALIAS(fn) __attribute__((alias(#fn), used, visibility("default"))); + +// Alias the C++ operators using the mangled names +// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) + +// operators delete and delete[] +void _ZdlPv(void *p) RPALIAS(rpfree) void _ZdaPv(void *p) RPALIAS(rpfree) + +#if ARCH_64BIT + // 64-bit operators new and new[], normal and aligned + void *_Znwm(uint64_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) + RPALIAS(rpmalloc) void *_Znam(uint64_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwmm(uint64_t size, + uint64_t align) + RPALIAS(rpaligned_alloc_reverse) void *_Znamm(uint64_t size, + uint64_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnwmSt11align_val_t( + size_t size, size_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnamSt11align_val_t( + size_t size, size_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnwmRKSt9nothrow_t( + size_t size, rp_nothrow_t t) + RPALIAS(rpmalloc_nothrow) void *_ZnamRKSt9nothrow_t( + size_t size, + rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void + *_ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, + size_t align, + rp_nothrow_t t) + RPALIAS(rpaligned_alloc_reverse_nothrow) void + *_ZnamSt11align_val_tRKSt9nothrow_t( + size_t size, size_t align, + rp_nothrow_t t) + RPALIAS(rpaligned_alloc_reverse_nothrow) + // 64-bit operators delete and delete[], sized and aligned + void _ZdlPvm(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvm(void *p, + size_t n) + RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) + 
RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, + size_t a) + RPALIAS(rpfree_aligned) void _ZdlPvmSt11align_val_t(void *p, + size_t n, + size_t a) + RPALIAS(rpfree_size_aligned) void _ZdaPvmSt11align_val_t( + void *p, size_t n, size_t a) + RPALIAS(rpfree_size_aligned) +#else + // 32-bit operators new and new[], normal and aligned + void *_Znwj(uint32_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) + RPALIAS(rpmalloc) void *_Znaj(uint32_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwjj(uint32_t size, + uint32_t align) + RPALIAS(rpaligned_alloc_reverse) void *_Znajj(uint32_t size, + uint32_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnwjSt11align_val_t( + size_t size, size_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnajSt11align_val_t( + size_t size, size_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnwjRKSt9nothrow_t( + size_t size, rp_nothrow_t t) + RPALIAS(rpmalloc_nothrow) void *_ZnajRKSt9nothrow_t( + size_t size, + rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void + *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, + size_t align, + rp_nothrow_t t) + RPALIAS(rpaligned_alloc_reverse_nothrow) void + *_ZnajSt11align_val_tRKSt9nothrow_t( + size_t size, size_t align, + rp_nothrow_t t) + RPALIAS(rpaligned_alloc_reverse_nothrow) + // 32-bit operators delete and delete[], sized and aligned + void _ZdlPvj(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvj(void *p, + size_t n) + RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) + RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, + size_t a) + RPALIAS(rpfree_aligned) void _ZdlPvjSt11align_val_t(void *p, + size_t n, + size_t a) + RPALIAS(rpfree_size_aligned) void _ZdaPvjSt11align_val_t( + void *p, size_t n, size_t a) + RPALIAS(rpfree_size_aligned) +#endif + + void *malloc(size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) + RPALIAS(rpmalloc) void *calloc(size_t count, size_t size) + RPALIAS(rpcalloc) void *realloc(void *ptr, size_t size) + RPALIAS(rprealloc) void *reallocf(void *ptr, size_t size) + RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2) + RPALIAS(rprealloc) void *aligned_alloc(size_t alignment, size_t size) + RPALIAS(rpaligned_alloc) void *memalign( + size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2) + RPALIAS(rpmemalign) int posix_memalign(void **memptr, size_t alignment, + size_t size) + RPALIAS(rpposix_memalign) void free(void *ptr) + RPALIAS(rpfree) void cfree(void *ptr) RPALIAS(rpfree) +#if defined(__ANDROID__) || defined(__FreeBSD__) + size_t + malloc_usable_size(const void *ptr) RPALIAS(rpmalloc_usable_size) +#else + size_t + malloc_usable_size(void *ptr) RPALIAS(rpmalloc_usable_size) +#endif + size_t malloc_size(void *ptr) RPALIAS(rpmalloc_usable_size) + +#endif + + static inline size_t _rpmalloc_page_size(void) { + return _memory_page_size; +} + +extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size); + +extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#ifdef _MSC_VER + int err = SizeTMult(count, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(count, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = count * size; +#endif + return realloc(ptr, total); +} + +extern inline void *RPMALLOC_CDECL 
valloc(size_t size) { + get_thread_heap(); + return rpaligned_alloc(_rpmalloc_page_size(), size); +} + +extern inline void *RPMALLOC_CDECL pvalloc(size_t size) { + get_thread_heap(); + const size_t page_size = _rpmalloc_page_size(); + const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size; +#if ENABLE_VALIDATE_ARGS + if (aligned_size < size) { + errno = EINVAL; + return 0; + } +#endif + return rpaligned_alloc(_rpmalloc_page_size(), aligned_size); +} + +#endif // ENABLE_OVERRIDE + +#if ENABLE_PRELOAD + +#ifdef _WIN32 + +#if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK + +extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, + DWORD reason, LPVOID reserved); + +extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, + DWORD reason, + LPVOID reserved) { + (void)sizeof(reserved); + (void)sizeof(instance); + if (reason == DLL_PROCESS_ATTACH) + rpmalloc_initialize(); + else if (reason == DLL_PROCESS_DETACH) + rpmalloc_finalize(); + else if (reason == DLL_THREAD_ATTACH) + rpmalloc_thread_initialize(); + else if (reason == DLL_THREAD_DETACH) + rpmalloc_thread_finalize(1); + return TRUE; +} + +// end BUILD_DYNAMIC_LINK +#else + +extern void _global_rpmalloc_init(void) { + rpmalloc_set_main_thread(); + rpmalloc_initialize(); +} + +#if defined(__clang__) || defined(__GNUC__) + +static void __attribute__((constructor)) initializer(void) { + _global_rpmalloc_init(); +} + +#elif defined(_MSC_VER) + +static int _global_rpmalloc_xib(void) { + _global_rpmalloc_init(); + return 0; +} + +#pragma section(".CRT$XIB", read) +__declspec(allocate(".CRT$XIB")) void (*_rpmalloc_module_init)(void) = + _global_rpmalloc_xib; +#if defined(_M_IX86) || defined(__i386__) +#pragma comment(linker, "/include:" \ + "__rpmalloc_module_init") +#else +#pragma comment(linker, "/include:" \ + "_rpmalloc_module_init") +#endif + +#endif + +// end !BUILD_DYNAMIC_LINK +#endif + +#else + +#include +#include +#include +#include + +extern void rpmalloc_set_main_thread(void); + +static pthread_key_t destructor_key; + +static void thread_destructor(void *); + +static void __attribute__((constructor)) initializer(void) { + rpmalloc_set_main_thread(); + rpmalloc_initialize(); + pthread_key_create(&destructor_key, thread_destructor); +} + +static void __attribute__((destructor)) finalizer(void) { rpmalloc_finalize(); } + +typedef struct { + void *(*real_start)(void *); + void *real_arg; +} thread_starter_arg; + +static void *thread_starter(void *argptr) { + thread_starter_arg *arg = argptr; + void *(*real_start)(void *) = arg->real_start; + void *real_arg = arg->real_arg; + rpmalloc_thread_initialize(); + rpfree(argptr); + pthread_setspecific(destructor_key, (void *)1); + return (*real_start)(real_arg); +} + +static void thread_destructor(void *value) { + (void)sizeof(value); + rpmalloc_thread_finalize(1); +} + +#ifdef __APPLE__ + +static int pthread_create_proxy(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) { + rpmalloc_initialize(); + thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); + starter_arg->real_start = start_routine; + starter_arg->real_arg = arg; + return pthread_create(thread, attr, thread_starter, starter_arg); +} + +MAC_INTERPOSE_SINGLE(pthread_create_proxy, pthread_create); + +#else + +#include + +int pthread_create(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) { +#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ + defined(__NetBSD__) 
|| defined(__DragonFly__) || defined(__APPLE__) || \ + defined(__HAIKU__) + char fname[] = "pthread_create"; +#else + char fname[] = "_pthread_create"; +#endif + void *real_pthread_create = dlsym(RTLD_NEXT, fname); + rpmalloc_thread_initialize(); + thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); + starter_arg->real_start = start_routine; + starter_arg->real_arg = arg; + return (*(int (*)(pthread_t *, const pthread_attr_t *, void *(*)(void *), + void *))real_pthread_create)(thread, attr, thread_starter, + starter_arg); +} + +#endif + +#endif + +#endif + +#if ENABLE_OVERRIDE + +#if defined(__GLIBC__) && defined(__linux__) + +void *__libc_malloc(size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) + RPALIAS(rpmalloc) void *__libc_calloc(size_t count, size_t size) + RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2) + RPALIAS(rpcalloc) void *__libc_realloc(void *p, size_t size) + RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rprealloc) void __libc_free(void *p) + RPALIAS(rpfree) void __libc_cfree(void *p) + RPALIAS(rpfree) void *__libc_memalign(size_t align, size_t size) + RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2) + RPALIAS(rpmemalign) int __posix_memalign(void **p, size_t align, + size_t size) + RPALIAS(rpposix_memalign) + + extern void *__libc_valloc(size_t size); +extern void *__libc_pvalloc(size_t size); + +void *__libc_valloc(size_t size) { return valloc(size); } + +void *__libc_pvalloc(size_t size) { return pvalloc(size); } + +#endif + +#endif + +#if (defined(__GNUC__) || defined(__clang__)) +#pragma GCC visibility pop +#endif diff --git a/llvm/lib/Support/rpmalloc/rpmalloc.c b/llvm/lib/Support/rpmalloc/rpmalloc.c index 0976ec8ae6af..a06d3cdb5b52 100644 --- a/llvm/lib/Support/rpmalloc/rpmalloc.c +++ b/llvm/lib/Support/rpmalloc/rpmalloc.c @@ -1,3992 +1,3992 @@ -//===---------------------- rpmalloc.c ------------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. 
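The interposition above hinges on one pattern: resolve the next definition of pthread_create with dlsym(RTLD_NEXT, ...), then splice a wrapper start routine in front of the caller's so per-thread allocator state is set up before user code runs. A minimal, self-contained sketch of that pattern follows; the names (my_thread_init, starter_arg_t) are illustrative, and my_thread_init stands in for rpmalloc_thread_initialize:

#define _GNU_SOURCE
#include <dlfcn.h>
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

typedef struct {
  void *(*real_start)(void *);
  void *real_arg;
} starter_arg_t;

typedef int (*pthread_create_fn)(pthread_t *, const pthread_attr_t *,
                                 void *(*)(void *), void *);

static void my_thread_init(void) { /* per-thread allocator setup */ }

static void *starter(void *raw) {
  starter_arg_t arg = *(starter_arg_t *)raw;
  free(raw);
  my_thread_init(); // runs on the new thread, before the user start routine
  return arg.real_start(arg.real_arg);
}

int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                   void *(*start_routine)(void *), void *arg) {
  // Resolve the libc definition that this definition shadows
  pthread_create_fn real_create =
      (pthread_create_fn)dlsym(RTLD_NEXT, "pthread_create");
  starter_arg_t *sa = malloc(sizeof(*sa));
  if (!sa)
    return EAGAIN;
  sa->real_start = start_routine;
  sa->real_arg = arg;
  return real_create(thread, attr, starter, sa);
}

Link with -ldl on glibc. The real code above allocates the trampoline argument with rpmalloc/rpfree rather than malloc/free, which matters once malloc itself is being overridden.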
-// -//===----------------------------------------------------------------------===// - -#include "rpmalloc.h" - -//////////// -/// -/// Build time configurable limits -/// -////// - -#if defined(__clang__) -#pragma clang diagnostic ignored "-Wunused-macros" -#pragma clang diagnostic ignored "-Wunused-function" -#if __has_warning("-Wreserved-identifier") -#pragma clang diagnostic ignored "-Wreserved-identifier" -#endif -#if __has_warning("-Wstatic-in-inline") -#pragma clang diagnostic ignored "-Wstatic-in-inline" -#endif -#elif defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wunused-macros" -#pragma GCC diagnostic ignored "-Wunused-function" -#endif - -#if !defined(__has_builtin) -#define __has_builtin(b) 0 -#endif - -#if defined(__GNUC__) || defined(__clang__) - -#if __has_builtin(__builtin_memcpy_inline) -#define _rpmalloc_memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s) -#else -#define _rpmalloc_memcpy_const(x, y, s) \ - do { \ - _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ - "len must be a constant integer"); \ - memcpy(x, y, s); \ - } while (0) -#endif - -#if __has_builtin(__builtin_memset_inline) -#define _rpmalloc_memset_const(x, y, s) __builtin_memset_inline(x, y, s) -#else -#define _rpmalloc_memset_const(x, y, s) \ - do { \ - _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ - "len must be a constant integer"); \ - memset(x, y, s); \ - } while (0) -#endif -#else -#define _rpmalloc_memcpy_const(x, y, s) memcpy(x, y, s) -#define _rpmalloc_memset_const(x, y, s) memset(x, y, s) -#endif - -#if __has_builtin(__builtin_assume) -#define rpmalloc_assume(cond) __builtin_assume(cond) -#elif defined(__GNUC__) -#define rpmalloc_assume(cond) \ - do { \ - if (!__builtin_expect(cond, 0)) \ - __builtin_unreachable(); \ - } while (0) -#elif defined(_MSC_VER) -#define rpmalloc_assume(cond) __assume(cond) -#else -#define rpmalloc_assume(cond) 0 -#endif - -#ifndef HEAP_ARRAY_SIZE -//! Size of heap hashmap -#define HEAP_ARRAY_SIZE 47 -#endif -#ifndef ENABLE_THREAD_CACHE -//! Enable per-thread cache -#define ENABLE_THREAD_CACHE 1 -#endif -#ifndef ENABLE_GLOBAL_CACHE -//! Enable global cache shared between all threads, requires thread cache -#define ENABLE_GLOBAL_CACHE 1 -#endif -#ifndef ENABLE_VALIDATE_ARGS -//! Enable validation of args to public entry points -#define ENABLE_VALIDATE_ARGS 0 -#endif -#ifndef ENABLE_STATISTICS -//! Enable statistics collection -#define ENABLE_STATISTICS 0 -#endif -#ifndef ENABLE_ASSERTS -//! Enable asserts -#define ENABLE_ASSERTS 0 -#endif -#ifndef ENABLE_OVERRIDE -//! Override standard library malloc/free and new/delete entry points -#define ENABLE_OVERRIDE 0 -#endif -#ifndef ENABLE_PRELOAD -//! Support preloading -#define ENABLE_PRELOAD 0 -#endif -#ifndef DISABLE_UNMAP -//! Disable unmapping memory pages (also enables unlimited cache) -#define DISABLE_UNMAP 0 -#endif -#ifndef ENABLE_UNLIMITED_CACHE -//! Enable unlimited global cache (no unmapping until finalization) -#define ENABLE_UNLIMITED_CACHE 0 -#endif -#ifndef ENABLE_ADAPTIVE_THREAD_CACHE -//! Enable adaptive thread cache size based on use heuristics -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif -#ifndef DEFAULT_SPAN_MAP_COUNT -//! Default number of spans to map in call to map more virtual memory (default -//! values yield 4MiB here) -#define DEFAULT_SPAN_MAP_COUNT 64 -#endif -#ifndef GLOBAL_CACHE_MULTIPLIER -//! 
Multiplier for global cache -#define GLOBAL_CACHE_MULTIPLIER 8 -#endif - -#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE -#error Must use global cache if unmap is disabled -#endif - -#if DISABLE_UNMAP -#undef ENABLE_UNLIMITED_CACHE -#define ENABLE_UNLIMITED_CACHE 1 -#endif - -#if !ENABLE_GLOBAL_CACHE -#undef ENABLE_UNLIMITED_CACHE -#define ENABLE_UNLIMITED_CACHE 0 -#endif - -#if !ENABLE_THREAD_CACHE -#undef ENABLE_ADAPTIVE_THREAD_CACHE -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif - -#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) -#define PLATFORM_WINDOWS 1 -#define PLATFORM_POSIX 0 -#else -#define PLATFORM_WINDOWS 0 -#define PLATFORM_POSIX 1 -#endif - -/// Platform and arch specifics -#if defined(_MSC_VER) && !defined(__clang__) -#pragma warning(disable : 5105) -#ifndef FORCEINLINE -#define FORCEINLINE inline __forceinline -#endif -#define _Static_assert static_assert -#else -#ifndef FORCEINLINE -#define FORCEINLINE inline __attribute__((__always_inline__)) -#endif -#endif -#if PLATFORM_WINDOWS -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include -#if ENABLE_VALIDATE_ARGS -#include -#endif -#else -#include -#include -#include -#include -#if defined(__linux__) || defined(__ANDROID__) -#include -#if !defined(PR_SET_VMA) -#define PR_SET_VMA 0x53564d41 -#define PR_SET_VMA_ANON_NAME 0 -#endif -#endif -#if defined(__APPLE__) -#include -#if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR -#include -#include -#endif -#include -#endif -#if defined(__HAIKU__) || defined(__TINYC__) -#include -#endif -#endif - -#include -#include -#include - -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -#include -static DWORD fls_key; -#endif - -#if PLATFORM_POSIX -#include -#include -#ifdef __FreeBSD__ -#include -#define MAP_HUGETLB MAP_ALIGNED_SUPER -#ifndef PROT_MAX -#define PROT_MAX(f) 0 -#endif -#else -#define PROT_MAX(f) 0 -#endif -#ifdef __sun -extern int madvise(caddr_t, size_t, int); -#endif -#ifndef MAP_UNINITIALIZED -#define MAP_UNINITIALIZED 0 -#endif -#endif -#include - -#if ENABLE_ASSERTS -#undef NDEBUG -#if defined(_MSC_VER) && !defined(_DEBUG) -#define _DEBUG -#endif -#include -#define RPMALLOC_TOSTRING_M(x) #x -#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x) -#define rpmalloc_assert(truth, message) \ - do { \ - if (!(truth)) { \ - if (_memory_config.error_callback) { \ - _memory_config.error_callback(message " (" RPMALLOC_TOSTRING( \ - truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__)); \ - } else { \ - assert((truth) && message); \ - } \ - } \ - } while (0) -#else -#define rpmalloc_assert(truth, message) \ - do { \ - } while (0) -#endif -#if ENABLE_STATISTICS -#include -#endif - -////// -/// -/// Atomic access abstraction (since MSVC does not do C11 yet) -/// -////// - -#if defined(_MSC_VER) && !defined(__clang__) - -typedef volatile long atomic32_t; -typedef volatile long long atomic64_t; -typedef volatile void *atomicptr_t; - -static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { return *src; } -static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) { - *dst = val; -} -static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) { - return (int32_t)InterlockedIncrement(val); -} -static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) { - return (int32_t)InterlockedDecrement(val); -} -static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) { - return (int32_t)InterlockedExchangeAdd(val, add) + add; -} -static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val, - int32_t ref) { - 
return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0; -} -static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) { - *dst = val; -} -static FORCEINLINE int64_t atomic_load64(atomic64_t *src) { return *src; } -static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) { - return (int64_t)InterlockedExchangeAdd64(val, add) + add; -} -static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) { - return (void *)*src; -} -static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) { - *dst = val; -} -static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) { - *dst = val; -} -static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst, - void *val) { - return (void *)InterlockedExchangePointer((void *volatile *)dst, val); -} -static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) { - return (InterlockedCompareExchangePointer((void *volatile *)dst, val, ref) == - ref) - ? 1 - : 0; -} - -#define EXPECTED(x) (x) -#define UNEXPECTED(x) (x) - -#else - -#include - -typedef volatile _Atomic(int32_t) atomic32_t; -typedef volatile _Atomic(int64_t) atomic64_t; -typedef volatile _Atomic(void *) atomicptr_t; - -static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { - return atomic_load_explicit(src, memory_order_relaxed); -} -static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) { - atomic_store_explicit(dst, val, memory_order_relaxed); -} -static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) { - return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; -} -static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) { - return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; -} -static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) { - return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; -} -static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val, - int32_t ref) { - return atomic_compare_exchange_weak_explicit( - dst, &ref, val, memory_order_acquire, memory_order_relaxed); -} -static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) { - atomic_store_explicit(dst, val, memory_order_release); -} -static FORCEINLINE int64_t atomic_load64(atomic64_t *val) { - return atomic_load_explicit(val, memory_order_relaxed); -} -static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) { - return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; -} -static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) { - return atomic_load_explicit(src, memory_order_relaxed); -} -static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) { - atomic_store_explicit(dst, val, memory_order_relaxed); -} -static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) { - atomic_store_explicit(dst, val, memory_order_release); -} -static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst, - void *val) { - return atomic_exchange_explicit(dst, val, memory_order_acquire); -} -static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) { - return atomic_compare_exchange_weak_explicit( - dst, &ref, val, memory_order_relaxed, memory_order_relaxed); -} - -#define EXPECTED(x) __builtin_expect((x), 1) -#define UNEXPECTED(x) __builtin_expect((x), 0) - -#endif - -//////////// -/// -/// Statistics related functions (evaluate to nothing when statistics not -/// enabled) -/// -////// - -#if ENABLE_STATISTICS -#define _rpmalloc_stat_inc(counter) 
atomic_incr32(counter) -#define _rpmalloc_stat_dec(counter) atomic_decr32(counter) -#define _rpmalloc_stat_add(counter, value) \ - atomic_add32(counter, (int32_t)(value)) -#define _rpmalloc_stat_add64(counter, value) \ - atomic_add64(counter, (int64_t)(value)) -#define _rpmalloc_stat_add_peak(counter, value, peak) \ - do { \ - int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); \ - if (_cur_count > (peak)) \ - peak = _cur_count; \ - } while (0) -#define _rpmalloc_stat_sub(counter, value) \ - atomic_add32(counter, -(int32_t)(value)) -#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ - do { \ - int32_t alloc_current = \ - atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ - if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ - heap->size_class_use[class_idx].alloc_peak = alloc_current; \ - atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ - } while (0) -#define _rpmalloc_stat_inc_free(heap, class_idx) \ - do { \ - atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ - atomic_incr32(&heap->size_class_use[class_idx].free_total); \ - } while (0) -#else -#define _rpmalloc_stat_inc(counter) \ - do { \ - } while (0) -#define _rpmalloc_stat_dec(counter) \ - do { \ - } while (0) -#define _rpmalloc_stat_add(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_add64(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_add_peak(counter, value, peak) \ - do { \ - } while (0) -#define _rpmalloc_stat_sub(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ - do { \ - } while (0) -#define _rpmalloc_stat_inc_free(heap, class_idx) \ - do { \ - } while (0) -#endif - -/// -/// Preconfigured limits and sizes -/// - -//! Granularity of a small allocation block (must be power of two) -#define SMALL_GRANULARITY 16 -//! Small granularity shift count -#define SMALL_GRANULARITY_SHIFT 4 -//! Number of small block size classes -#define SMALL_CLASS_COUNT 65 -//! Maximum size of a small block -#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1)) -//! Granularity of a medium allocation block -#define MEDIUM_GRANULARITY 512 -//! Medium granularity shift count -#define MEDIUM_GRANULARITY_SHIFT 9 -//! Number of medium block size classes -#define MEDIUM_CLASS_COUNT 61 -//! Total number of small + medium size classes -#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) -//! Number of large block size classes -#define LARGE_CLASS_COUNT 63 -//! Maximum size of a medium block -#define MEDIUM_SIZE_LIMIT \ - (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) -//! Maximum size of a large block -#define LARGE_SIZE_LIMIT \ - ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power -//! of two) -#define SPAN_HEADER_SIZE 128 -//! Number of spans in thread cache -#define MAX_THREAD_SPAN_CACHE 400 -//! Number of spans to transfer between thread and global cache -#define THREAD_SPAN_CACHE_TRANSFER 64 -//! Number of spans in thread cache for large spans (must be greater than -//! LARGE_CLASS_COUNT / 2) -#define MAX_THREAD_SPAN_LARGE_CACHE 100 -//! 
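To make the constants above concrete: small classes advance in 16-byte steps up to 1024 bytes (SMALL_CLASS_COUNT of 65, counting class 0), and medium classes in 512-byte steps up to roughly 32KiB. A sketch of the small-class indexing these constants imply, assuming the straightforward rounding scheme; the allocator's actual lookup goes through the _memory_size_class table, which can merge adjacent classes with equal block counts:

#include <stddef.h>
#include <stdint.h>

// Illustrative only; mirrors SMALL_GRANULARITY / SMALL_GRANULARITY_SHIFT.
static uint32_t small_class_index(size_t size) {
  // 0 -> 0, 1..16 -> 1, 17..32 -> 2, ..., 1009..1024 -> 64
  return (uint32_t)((size + 15) >> 4);
}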
Number of spans to transfer between thread and global cache for large spans
-#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6
-
-_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0,
-               "Small granularity must be power of two");
-_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0,
-               "Span header size must be power of two");
-
-#if ENABLE_VALIDATE_ARGS
-//! Maximum allocation size to avoid integer overflow
-#undef MAX_ALLOC_SIZE
-#define MAX_ALLOC_SIZE (((size_t) - 1) - _memory_span_size)
-#endif
-
-#define pointer_offset(ptr, ofs) (void *)((char *)(ptr) + (ptrdiff_t)(ofs))
-#define pointer_diff(first, second) \
-  (ptrdiff_t)((const char *)(first) - (const char *)(second))
-
-#define INVALID_POINTER ((void *)((uintptr_t) - 1))
-
-#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT
-#define SIZE_CLASS_HUGE ((uint32_t) - 1)
-
-////////////
-///
-/// Data types
-///
-//////
-
-//! A memory heap, per thread
-typedef struct heap_t heap_t;
-//! Span of memory pages
-typedef struct span_t span_t;
-//! Span list
-typedef struct span_list_t span_list_t;
-//! Span active data
-typedef struct span_active_t span_active_t;
-//! Size class definition
-typedef struct size_class_t size_class_t;
-//! Global cache
-typedef struct global_cache_t global_cache_t;
-
-//! Flag indicating span is the first (master) span of a split superspan
-#define SPAN_FLAG_MASTER 1U
-//! Flag indicating span is a secondary (sub) span of a split superspan
-#define SPAN_FLAG_SUBSPAN 2U
-//! Flag indicating span has blocks with increased alignment
-#define SPAN_FLAG_ALIGNED_BLOCKS 4U
-//! Flag indicating an unmapped master span
-#define SPAN_FLAG_UNMAPPED_MASTER 8U
-
-#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
-struct span_use_t {
-  //! Current number of spans used (actually used, not in cache)
-  atomic32_t current;
-  //! High water mark of spans used
-  atomic32_t high;
-#if ENABLE_STATISTICS
-  //! Number of spans in deferred list
-  atomic32_t spans_deferred;
-  //! Number of spans transitioned to global cache
-  atomic32_t spans_to_global;
-  //! Number of spans transitioned from global cache
-  atomic32_t spans_from_global;
-  //! Number of spans transitioned to thread cache
-  atomic32_t spans_to_cache;
-  //! Number of spans transitioned from thread cache
-  atomic32_t spans_from_cache;
-  //! Number of spans transitioned to reserved state
-  atomic32_t spans_to_reserved;
-  //! Number of spans transitioned from reserved state
-  atomic32_t spans_from_reserved;
-  //! Number of raw memory map calls
-  atomic32_t spans_map_calls;
-#endif
-};
-typedef struct span_use_t span_use_t;
-#endif
-
-#if ENABLE_STATISTICS
-struct size_class_use_t {
-  //! Current number of allocations
-  atomic32_t alloc_current;
-  //! Peak number of allocations
-  int32_t alloc_peak;
-  //! Total number of allocations
-  atomic32_t alloc_total;
-  //! Total number of frees
-  atomic32_t free_total;
-  //! Number of spans in use
-  atomic32_t spans_current;
-  //! Peak number of spans in use
-  int32_t spans_peak;
-  //! Number of spans transitioned to cache
-  atomic32_t spans_to_cache;
-  //! Number of spans transitioned from cache
-  atomic32_t spans_from_cache;
-  //! Number of spans transitioned from reserved state
-  atomic32_t spans_from_reserved;
-  //!
Number of spans mapped - atomic32_t spans_map_calls; - int32_t unused; -}; -typedef struct size_class_use_t size_class_use_t; -#endif - -// A span can either represent a single span of memory pages with size declared -// by span_map_count configuration variable, or a set of spans in a continuous -// region, a super span. Any reference to the term "span" usually refers to both -// a single span or a super span. A super span can further be divided into -// multiple spans (or this, super spans), where the first (super)span is the -// master and subsequent (super)spans are subspans. The master span keeps track -// of how many subspans that are still alive and mapped in virtual memory, and -// once all subspans and master have been unmapped the entire superspan region -// is released and unmapped (on Windows for example, the entire superspan range -// has to be released in the same call to release the virtual memory range, but -// individual subranges can be decommitted individually to reduce physical -// memory use). -struct span_t { - //! Free list - void *free_list; - //! Total block count of size class - uint32_t block_count; - //! Size class - uint32_t size_class; - //! Index of last block initialized in free list - uint32_t free_list_limit; - //! Number of used blocks remaining when in partial state - uint32_t used_count; - //! Deferred free list - atomicptr_t free_list_deferred; - //! Size of deferred free list, or list of spans when part of a cache list - uint32_t list_size; - //! Size of a block - uint32_t block_size; - //! Flags and counters - uint32_t flags; - //! Number of spans - uint32_t span_count; - //! Total span counter for master spans - uint32_t total_spans; - //! Offset from master span for subspans - uint32_t offset_from_master; - //! Remaining span counter, for master spans - atomic32_t remaining_spans; - //! Alignment offset - uint32_t align_offset; - //! Owning heap - heap_t *heap; - //! Next span - span_t *next; - //! Previous span - span_t *prev; -}; -_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); - -struct span_cache_t { - size_t count; - span_t *span[MAX_THREAD_SPAN_CACHE]; -}; -typedef struct span_cache_t span_cache_t; - -struct span_large_cache_t { - size_t count; - span_t *span[MAX_THREAD_SPAN_LARGE_CACHE]; -}; -typedef struct span_large_cache_t span_large_cache_t; - -struct heap_size_class_t { - //! Free list of active span - void *free_list; - //! Double linked list of partially used spans with free blocks. - // Previous span pointer in head points to tail span of list. - span_t *partial_span; - //! Early level cache of fully free spans - span_t *cache; -}; -typedef struct heap_size_class_t heap_size_class_t; - -// Control structure for a heap, either a thread heap or a first class heap if -// enabled -struct heap_t { - //! Owning thread ID - uintptr_t owner_thread; - //! Free lists for each size class - heap_size_class_t size_class[SIZE_CLASS_COUNT]; -#if ENABLE_THREAD_CACHE - //! Arrays of fully freed spans, single span - span_cache_t span_cache; -#endif - //! List of deferred free spans (single linked list) - atomicptr_t span_free_deferred; - //! Number of full spans - size_t full_span_count; - //! Mapped but unused spans - span_t *span_reserve; - //! Master span for mapped but unused spans - span_t *span_reserve_master; - //! Number of mapped but unused spans - uint32_t spans_reserved; - //! Child count - atomic32_t child_count; - //! Next heap in id list - heap_t *next_heap; - //! 
Next heap in orphan list - heap_t *next_orphan; - //! Heap ID - int32_t id; - //! Finalization state flag - int finalize; - //! Master heap owning the memory pages - heap_t *master_heap; -#if ENABLE_THREAD_CACHE - //! Arrays of fully freed spans, large spans with > 1 span count - span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; -#endif -#if RPMALLOC_FIRST_CLASS_HEAPS - //! Double linked list of fully utilized spans with free blocks for each size - //! class. - // Previous span pointer in head points to tail span of list. - span_t *full_span[SIZE_CLASS_COUNT]; - //! Double linked list of large and huge spans allocated by this heap - span_t *large_huge_span; -#endif -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - //! Current and high water mark of spans used per span count - span_use_t span_use[LARGE_CLASS_COUNT]; -#endif -#if ENABLE_STATISTICS - //! Allocation stats per size class - size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; - //! Number of bytes transitioned thread -> global - atomic64_t thread_to_global; - //! Number of bytes transitioned global -> thread - atomic64_t global_to_thread; -#endif -}; - -// Size class for defining a block size bucket -struct size_class_t { - //! Size of blocks in this class - uint32_t block_size; - //! Number of blocks in each chunk - uint16_t block_count; - //! Class index this class is merged with - uint16_t class_idx; -}; -_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); - -struct global_cache_t { - //! Cache lock - atomic32_t lock; - //! Cache count - uint32_t count; -#if ENABLE_STATISTICS - //! Insert count - size_t insert_count; - //! Extract count - size_t extract_count; -#endif - //! Cached spans - span_t *span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; - //! Unlimited cache overflow - span_t *overflow; -}; - -//////////// -/// -/// Global data -/// -////// - -//! Default span size (64KiB) -#define _memory_default_span_size (64 * 1024) -#define _memory_default_span_size_shift 16 -#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) - -//! Initialized flag -static int _rpmalloc_initialized; -//! Main thread ID -static uintptr_t _rpmalloc_main_thread_id; -//! Configuration -static rpmalloc_config_t _memory_config; -//! Memory page size -static size_t _memory_page_size; -//! Shift to divide by page size -static size_t _memory_page_size_shift; -//! Granularity at which memory pages are mapped by OS -static size_t _memory_map_granularity; -#if RPMALLOC_CONFIGURABLE -//! Size of a span of memory pages -static size_t _memory_span_size; -//! Shift to divide by span size -static size_t _memory_span_size_shift; -//! Mask to get to start of a memory span -static uintptr_t _memory_span_mask; -#else -//! Hardwired span size -#define _memory_span_size _memory_default_span_size -#define _memory_span_size_shift _memory_default_span_size_shift -#define _memory_span_mask _memory_default_span_mask -#endif -//! Number of spans to map in each map call -static size_t _memory_span_map_count; -//! Number of spans to keep reserved in each heap -static size_t _memory_heap_reserve_count; -//! Global size classes -static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; -//! Run-time size limit of medium blocks -static size_t _memory_medium_size_limit; -//! Heap ID counter -static atomic32_t _memory_heap_id; -//! Huge page support -static int _memory_huge_pages; -#if ENABLE_GLOBAL_CACHE -//! Global span cache -static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; -#endif -//! 
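The default span mask above is what makes deallocation cheap: because spans are 64KiB-sized and mapped 64KiB-aligned, masking off the low bits of any block pointer lands on the span header. A minimal sketch with the default values; the RPMALLOC_CONFIGURABLE build derives the mask from _memory_span_size instead:

#include <stdint.h>

#define EXAMPLE_SPAN_SIZE ((uintptr_t)(64 * 1024))
#define EXAMPLE_SPAN_MASK (~(EXAMPLE_SPAN_SIZE - 1))

// Recover the span header owning an arbitrary block pointer.
static void *owning_span(void *block) {
  return (void *)((uintptr_t)block & EXAMPLE_SPAN_MASK);
}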
Global reserved spans -static span_t *_memory_global_reserve; -//! Global reserved count -static size_t _memory_global_reserve_count; -//! Global reserved master -static span_t *_memory_global_reserve_master; -//! All heaps -static heap_t *_memory_heaps[HEAP_ARRAY_SIZE]; -//! Used to restrict access to mapping memory for huge pages -static atomic32_t _memory_global_lock; -//! Orphaned heaps -static heap_t *_memory_orphan_heaps; -#if RPMALLOC_FIRST_CLASS_HEAPS -//! Orphaned heaps (first class heaps) -static heap_t *_memory_first_class_orphan_heaps; -#endif -#if ENABLE_STATISTICS -//! Allocations counter -static atomic64_t _allocation_counter; -//! Deallocations counter -static atomic64_t _deallocation_counter; -//! Active heap count -static atomic32_t _memory_active_heaps; -//! Number of currently mapped memory pages -static atomic32_t _mapped_pages; -//! Peak number of concurrently mapped memory pages -static int32_t _mapped_pages_peak; -//! Number of mapped master spans -static atomic32_t _master_spans; -//! Number of unmapped dangling master spans -static atomic32_t _unmapped_master_spans; -//! Running counter of total number of mapped memory pages since start -static atomic32_t _mapped_total; -//! Running counter of total number of unmapped memory pages since start -static atomic32_t _unmapped_total; -//! Number of currently mapped memory pages in OS calls -static atomic32_t _mapped_pages_os; -//! Number of currently allocated pages in huge allocations -static atomic32_t _huge_pages_current; -//! Peak number of currently allocated pages in huge allocations -static int32_t _huge_pages_peak; -#endif - -//////////// -/// -/// Thread local heap and ID -/// -////// - -//! Current thread heap -#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ - defined(__TINYC__) -static pthread_key_t _memory_thread_heap; -#else -#ifdef _MSC_VER -#define _Thread_local __declspec(thread) -#define TLS_MODEL -#else -#ifndef __HAIKU__ -#define TLS_MODEL __attribute__((tls_model("initial-exec"))) -#else -#define TLS_MODEL -#endif -#if !defined(__clang__) && defined(__GNUC__) -#define _Thread_local __thread -#endif -#endif -static _Thread_local heap_t *_memory_thread_heap TLS_MODEL; -#endif - -static inline heap_t *get_thread_heap_raw(void) { -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - return pthread_getspecific(_memory_thread_heap); -#else - return _memory_thread_heap; -#endif -} - -//! Get the current thread heap -static inline heap_t *get_thread_heap(void) { - heap_t *heap = get_thread_heap_raw(); -#if ENABLE_PRELOAD - if (EXPECTED(heap != 0)) - return heap; - rpmalloc_initialize(); - return get_thread_heap_raw(); -#else - return heap; -#endif -} - -//! 
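On the pthread_key path above (macOS and Haiku with ENABLE_PRELOAD, and TinyC), the thread heap cannot live in a _Thread_local variable, apparently because resolving such a variable while preloaded can itself trigger allocation before the allocator is ready. A reduced model of the fallback, with illustrative names:

#include <pthread.h>
#include <stddef.h>

static pthread_key_t heap_key; // created once, e.g. from a constructor

static void heap_key_init(void) { pthread_key_create(&heap_key, NULL); }

static void *get_heap(void) { return pthread_getspecific(heap_key); }

static void set_heap(void *heap) { pthread_setspecific(heap_key, heap); }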
Fast thread ID -static inline uintptr_t get_thread_id(void) { -#if defined(_WIN32) - return (uintptr_t)((void *)NtCurrentTeb()); -#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) - uintptr_t tid; -#if defined(__i386__) - __asm__("movl %%gs:0, %0" : "=r"(tid) : :); -#elif defined(__x86_64__) -#if defined(__MACH__) - __asm__("movq %%gs:0, %0" : "=r"(tid) : :); -#else - __asm__("movq %%fs:0, %0" : "=r"(tid) : :); -#endif -#elif defined(__arm__) - __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid)); -#elif defined(__aarch64__) -#if defined(__MACH__) - // tpidr_el0 likely unused, always return 0 on iOS - __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid)); -#else - __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid)); -#endif -#else -#error This platform needs implementation of get_thread_id() -#endif - return tid; -#else -#error This platform needs implementation of get_thread_id() -#endif -} - -//! Set the current thread heap -static void set_thread_heap(heap_t *heap) { -#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ - defined(__TINYC__) - pthread_setspecific(_memory_thread_heap, heap); -#else - _memory_thread_heap = heap; -#endif - if (heap) - heap->owner_thread = get_thread_id(); -} - -//! Set main thread ID -extern void rpmalloc_set_main_thread(void); - -void rpmalloc_set_main_thread(void) { - _rpmalloc_main_thread_id = get_thread_id(); -} - -static void _rpmalloc_spin(void) { -#if defined(_MSC_VER) -#if defined(_M_ARM64) - __yield(); -#else - _mm_pause(); -#endif -#elif defined(__x86_64__) || defined(__i386__) - __asm__ volatile("pause" ::: "memory"); -#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) - __asm__ volatile("yield" ::: "memory"); -#elif defined(__powerpc__) || defined(__powerpc64__) - // No idea if ever been compiled in such archs but ... as precaution - __asm__ volatile("or 27,27,27"); -#elif defined(__sparc__) - __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0"); -#else - struct timespec ts = {0}; - nanosleep(&ts, 0); -#endif -} - -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -static void NTAPI _rpmalloc_thread_destructor(void *value) { -#if ENABLE_OVERRIDE - // If this is called on main thread it means rpmalloc_finalize - // has not been called and shutdown is forced (through _exit) or unclean - if (get_thread_id() == _rpmalloc_main_thread_id) - return; -#endif - if (value) - rpmalloc_thread_finalize(1); -} -#endif - -//////////// -/// -/// Low level memory map/unmap -/// -////// - -static void _rpmalloc_set_name(void *address, size_t size) { -#if defined(__linux__) || defined(__ANDROID__) - const char *name = _memory_huge_pages ? _memory_config.huge_page_name - : _memory_config.page_name; - if (address == MAP_FAILED || !name) - return; - // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails - // (e.g. invalid name) it is a no-op basically. - (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, - (uintptr_t)name); -#else - (void)sizeof(size); - (void)sizeof(address); -#endif -} - -//! 
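The pause/yield primitive above exists to build polite spin locks; every lock in this file is the same three-line loop over atomic_cas32_acquire, paired with a release store to unlock. A sketch of the idiom, reusing the primitives declared above:

static atomic32_t example_lock; // 0 = unlocked, 1 = locked

static void example_acquire(void) {
  while (!atomic_cas32_acquire(&example_lock, 1, 0))
    _rpmalloc_spin(); // back off instead of hammering the cache line
}

static void example_release(void) {
  atomic_store32_release(&example_lock, 0);
}

The acquire on the CAS and the release on the store are what make writes done inside the critical section visible to the next thread that takes the lock.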
Map more virtual memory -// size is number of bytes to map -// offset receives the offset in bytes from start of mapped region -// returns address to start of mapped region to use -static void *_rpmalloc_mmap(size_t size, size_t *offset) { - rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size"); - rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); - void *address = _memory_config.memory_map(size, offset); - if (EXPECTED(address != 0)) { - _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), - _mapped_pages_peak); - _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift)); - } - return address; -} - -//! Unmap virtual memory -// address is the memory address to unmap, as returned from _memory_map -// size is the number of bytes to unmap, which might be less than full region -// for a partial unmap offset is the offset in bytes to the actual mapped -// region, as set by _memory_map release is set to 0 for partial unmap, or size -// of entire range for a full unmap -static void _rpmalloc_unmap(void *address, size_t size, size_t offset, - size_t release) { - rpmalloc_assert(!release || (release >= size), "Invalid unmap size"); - rpmalloc_assert(!release || (release >= _memory_page_size), - "Invalid unmap size"); - if (release) { - rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size"); - _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift)); - _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift)); - } - _memory_config.memory_unmap(address, size, offset, release); -} - -//! Default implementation to map new pages to virtual memory -static void *_rpmalloc_mmap_os(size_t size, size_t *offset) { - // Either size is a heap (a single page) or a (multiple) span - we only need - // to align spans, and only if larger than map granularity - size_t padding = ((size >= _memory_span_size) && - (_memory_span_size > _memory_map_granularity)) - ? _memory_span_size - : 0; - rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); -#if PLATFORM_WINDOWS - // Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not - // allocated unless/until the virtual addresses are actually accessed" - void *ptr = VirtualAlloc(0, size + padding, - (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | - MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); - if (!ptr) { - if (_memory_config.map_fail_callback) { - if (_memory_config.map_fail_callback(size + padding)) - return _rpmalloc_mmap_os(size, offset); - } else { - rpmalloc_assert(ptr, "Failed to map virtual memory block"); - } - return 0; - } -#else - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; -#if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR - int fd = (int)VM_MAKE_TAG(240U); - if (_memory_huge_pages) - fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; - void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); -#elif defined(MAP_HUGETLB) - void *ptr = mmap(0, size + padding, - PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE), - (_memory_huge_pages ? 
MAP_HUGETLB : 0) | flags, -1, 0); -#if defined(MADV_HUGEPAGE) - // In some configurations, huge pages allocations might fail thus - // we fallback to normal allocations and promote the region as transparent - // huge page - if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) { - ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); - if (ptr && ptr != MAP_FAILED) { - int prm = madvise(ptr, size + padding, MADV_HUGEPAGE); - (void)prm; - rpmalloc_assert((prm == 0), "Failed to promote the page to THP"); - } - } -#endif - _rpmalloc_set_name(ptr, size + padding); -#elif defined(MAP_ALIGNED) - const size_t align = - (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1)); - void *ptr = - mmap(0, size + padding, PROT_READ | PROT_WRITE, - (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0); -#elif defined(MAP_ALIGN) - caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0); - void *ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE, - (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0); -#else - void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); -#endif - if ((ptr == MAP_FAILED) || !ptr) { - if (_memory_config.map_fail_callback) { - if (_memory_config.map_fail_callback(size + padding)) - return _rpmalloc_mmap_os(size, offset); - } else if (errno != ENOMEM) { - rpmalloc_assert((ptr != MAP_FAILED) && ptr, - "Failed to map virtual memory block"); - } - return 0; - } -#endif - _rpmalloc_stat_add(&_mapped_pages_os, - (int32_t)((size + padding) >> _memory_page_size_shift)); - if (padding) { - size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); - rpmalloc_assert(final_padding <= _memory_span_size, - "Internal failure in padding"); - rpmalloc_assert(final_padding <= padding, "Internal failure in padding"); - rpmalloc_assert(!(final_padding % 8), "Internal failure in padding"); - ptr = pointer_offset(ptr, final_padding); - *offset = final_padding >> 3; - } - rpmalloc_assert((size < _memory_span_size) || - !((uintptr_t)ptr & ~_memory_span_mask), - "Internal failure in padding"); - return ptr; -} - -//! Default implementation to unmap pages from virtual memory -static void _rpmalloc_unmap_os(void *address, size_t size, size_t offset, - size_t release) { - rpmalloc_assert(release || (offset == 0), "Invalid unmap size"); - rpmalloc_assert(!release || (release >= _memory_page_size), - "Invalid unmap size"); - rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size"); - if (release && offset) { - offset <<= 3; - address = pointer_offset(address, -(int32_t)offset); - if ((release >= _memory_span_size) && - (_memory_span_size > _memory_map_granularity)) { - // Padding is always one span size - release += _memory_span_size; - } - } -#if !DISABLE_UNMAP -#if PLATFORM_WINDOWS - if (!VirtualFree(address, release ? 0 : size, - release ? 
MEM_RELEASE : MEM_DECOMMIT)) { - rpmalloc_assert(0, "Failed to unmap virtual memory block"); - } -#else - if (release) { - if (munmap(address, release)) { - rpmalloc_assert(0, "Failed to unmap virtual memory block"); - } - } else { -#if defined(MADV_FREE_REUSABLE) - int ret; - while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && - (errno == EAGAIN)) - errno = 0; - if ((ret == -1) && (errno != 0)) { -#elif defined(MADV_DONTNEED) - if (madvise(address, size, MADV_DONTNEED)) { -#elif defined(MADV_PAGEOUT) - if (madvise(address, size, MADV_PAGEOUT)) { -#elif defined(MADV_FREE) - if (madvise(address, size, MADV_FREE)) { -#else - if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { -#endif - rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); - } - } -#endif -#endif - if (release) - _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); -} - -static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, - span_t *subspan, - size_t span_count); - -//! Use global reserved spans to fulfill a memory map request (reserve size must -//! be checked by caller) -static span_t *_rpmalloc_global_get_reserved_spans(size_t span_count) { - span_t *span = _memory_global_reserve; - _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, - span, span_count); - _memory_global_reserve_count -= span_count; - if (_memory_global_reserve_count) - _memory_global_reserve = - (span_t *)pointer_offset(span, span_count << _memory_span_size_shift); - else - _memory_global_reserve = 0; - return span; -} - -//! Store the given spans as global reserve (must only be called from within new -//! heap allocation, not thread safe) -static void _rpmalloc_global_set_reserved_spans(span_t *master, span_t *reserve, - size_t reserve_span_count) { - _memory_global_reserve_master = master; - _memory_global_reserve_count = reserve_span_count; - _memory_global_reserve = reserve; -} - -//////////// -/// -/// Span linked list management -/// -////// - -//! Add a span to double linked list at the head -static void _rpmalloc_span_double_link_list_add(span_t **head, span_t *span) { - if (*head) - (*head)->prev = span; - span->next = *head; - *head = span; -} - -//! Pop head span from double linked list -static void _rpmalloc_span_double_link_list_pop_head(span_t **head, - span_t *span) { - rpmalloc_assert(*head == span, "Linked list corrupted"); - span = *head; - *head = span->next; -} - -//! Remove a span from double linked list -static void _rpmalloc_span_double_link_list_remove(span_t **head, - span_t *span) { - rpmalloc_assert(*head, "Linked list corrupted"); - if (*head == span) { - *head = span->next; - } else { - span_t *next_span = span->next; - span_t *prev_span = span->prev; - prev_span->next = next_span; - if (EXPECTED(next_span != 0)) - next_span->prev = prev_span; - } -} - -//////////// -/// -/// Span control -/// -////// - -static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span); - -static void _rpmalloc_heap_finalize(heap_t *heap); - -static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, - span_t *reserve, - size_t reserve_span_count); - -//! Declare the span to be a subspan and store distance from master span and -//! 
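The subspan marking declared above is what later lets any span locate its master without a global lookup: the subspan records its distance from the master in span units. A sketch of the recovery step, the same arithmetic _rpmalloc_span_unmap performs further down; it reuses span_t, pointer_offset and _memory_span_size from this file, so it is not standalone:

static span_t *span_master_of(span_t *span) {
  if (span->flags & SPAN_FLAG_MASTER)
    return span;
  // offset_from_master counts spans, so scale back to bytes and walk left
  return (span_t *)pointer_offset(
      span, -(intptr_t)((uintptr_t)span->offset_from_master *
                        _memory_span_size));
}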
span count -static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, - span_t *subspan, - size_t span_count) { - rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), - "Span master pointer and/or flag mismatch"); - if (subspan != master) { - subspan->flags = SPAN_FLAG_SUBSPAN; - subspan->offset_from_master = - (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> - _memory_span_size_shift); - subspan->align_offset = 0; - } - subspan->span_count = (uint32_t)span_count; -} - -//! Use reserved spans to fulfill a memory map request (reserve size must be -//! checked by caller) -static span_t *_rpmalloc_span_map_from_reserve(heap_t *heap, - size_t span_count) { - // Update the heap span reserve - span_t *span = heap->span_reserve; - heap->span_reserve = - (span_t *)pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved -= (uint32_t)span_count; - - _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, - span_count); - if (span_count <= LARGE_CLASS_COUNT) - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); - - return span; -} - -//! Get the aligned number of spans to map in based on wanted count, configured -//! mapping granularity and the page size -static size_t _rpmalloc_span_align_count(size_t span_count) { - size_t request_count = (span_count > _memory_span_map_count) - ? span_count - : _memory_span_map_count; - if ((_memory_page_size > _memory_span_size) && - ((request_count * _memory_span_size) % _memory_page_size)) - request_count += - _memory_span_map_count - (request_count % _memory_span_map_count); - return request_count; -} - -//! Setup a newly mapped span -static void _rpmalloc_span_initialize(span_t *span, size_t total_span_count, - size_t span_count, size_t align_offset) { - span->total_spans = (uint32_t)total_span_count; - span->span_count = (uint32_t)span_count; - span->align_offset = (uint32_t)align_offset; - span->flags = SPAN_FLAG_MASTER; - atomic_store32(&span->remaining_spans, (int32_t)total_span_count); -} - -static void _rpmalloc_span_unmap(span_t *span); - -//! Map an aligned set of spans, taking configured mapping granularity and the -//! page size into account -static span_t *_rpmalloc_span_map_aligned_count(heap_t *heap, - size_t span_count) { - // If we already have some, but not enough, reserved spans, release those to - // heap cache and map a new full set of spans. 
Otherwise we would waste memory
-  // if page size > span size (huge pages)
-  size_t aligned_span_count = _rpmalloc_span_align_count(span_count);
-  size_t align_offset = 0;
-  span_t *span = (span_t *)_rpmalloc_mmap(
-      aligned_span_count * _memory_span_size, &align_offset);
-  if (!span)
-    return 0;
-  _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset);
-  _rpmalloc_stat_inc(&_master_spans);
-  if (span_count <= LARGE_CLASS_COUNT)
-    _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls);
-  if (aligned_span_count > span_count) {
-    span_t *reserved_spans =
-        (span_t *)pointer_offset(span, span_count * _memory_span_size);
-    size_t reserved_count = aligned_span_count - span_count;
-    if (heap->spans_reserved) {
-      _rpmalloc_span_mark_as_subspan_unless_master(
-          heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
-      _rpmalloc_heap_cache_insert(heap, heap->span_reserve);
-    }
-    if (reserved_count > _memory_heap_reserve_count) {
-      // If huge pages or eager span map count, the global reserve spin lock is
-      // held by caller, _rpmalloc_span_map
-      rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1,
-                      "Global spin lock not held as expected");
-      size_t remain_count = reserved_count - _memory_heap_reserve_count;
-      reserved_count = _memory_heap_reserve_count;
-      span_t *remain_span = (span_t *)pointer_offset(
-          reserved_spans, reserved_count * _memory_span_size);
-      if (_memory_global_reserve) {
-        _rpmalloc_span_mark_as_subspan_unless_master(
-            _memory_global_reserve_master, _memory_global_reserve,
-            _memory_global_reserve_count);
-        _rpmalloc_span_unmap(_memory_global_reserve);
-      }
-      _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count);
-    }
-    _rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans,
-                                      reserved_count);
-  }
-  return span;
-}
-
-//! Map in memory pages for the given number of spans (or use previously
-//! reserved pages)
-static span_t *_rpmalloc_span_map(heap_t *heap, size_t span_count) {
-  if (span_count <= heap->spans_reserved)
-    return _rpmalloc_span_map_from_reserve(heap, span_count);
-  span_t *span = 0;
-  int use_global_reserve =
-      (_memory_page_size > _memory_span_size) ||
-      (_memory_span_map_count > _memory_heap_reserve_count);
-  if (use_global_reserve) {
-    // If huge pages, make sure only one thread maps more memory to avoid bloat
-    while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
-      _rpmalloc_spin();
-    if (_memory_global_reserve_count >= span_count) {
-      size_t reserve_count =
-          (!heap->spans_reserved ? _memory_heap_reserve_count : span_count);
-      if (_memory_global_reserve_count < reserve_count)
-        reserve_count = _memory_global_reserve_count;
-      span = _rpmalloc_global_get_reserved_spans(reserve_count);
-      if (span) {
-        if (reserve_count > span_count) {
-          span_t *reserved_span = (span_t *)pointer_offset(
-              span, span_count << _memory_span_size_shift);
-          _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master,
-                                            reserved_span,
-                                            reserve_count - span_count);
-        }
-        // Already marked as subspan in _rpmalloc_global_get_reserved_spans
-        span->span_count = (uint32_t)span_count;
-      }
-    }
-  }
-  if (!span)
-    span = _rpmalloc_span_map_aligned_count(heap, span_count);
-  if (use_global_reserve)
-    atomic_store32_release(&_memory_global_lock, 0);
-  return span;
-}
-
-//! Unmap memory pages for the given number of spans (or mark as unused if no
-//!
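Worked numbers for the rounding in _rpmalloc_span_align_count above, assuming the default 64KiB spans and 64-span map units on a hypothetical 2MiB (huge) page: a request is first raised to the map unit, then padded by whole map units while the byte size does not fall on a page boundary. A parameterized restatement for illustration, not the library's entry point:

#include <assert.h>
#include <stddef.h>

static size_t align_count(size_t span_count, size_t map_count,
                          size_t span_size, size_t page_size) {
  size_t request = (span_count > map_count) ? span_count : map_count;
  if ((page_size > span_size) && ((request * span_size) % page_size))
    request += map_count - (request % map_count);
  return request;
}

static void demo(void) {
  // 64KiB spans, 64-span map units, 2MiB pages
  assert(align_count(3, 64, 64 * 1024, 2 * 1024 * 1024) == 64);
  assert(align_count(70, 64, 64 * 1024, 2 * 1024 * 1024) == 128);
}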
partial unmappings) -static void _rpmalloc_span_unmap(span_t *span) { - rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || - (span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || - !(span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - - int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t *master = - is_master ? span - : ((span_t *)pointer_offset( - span, -(intptr_t)((uintptr_t)span->offset_from_master * - _memory_span_size))); - rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); - - size_t span_count = span->span_count; - if (!is_master) { - // Directly unmap subspans (unless huge pages, in which case we defer and - // unmap entire page range with master) - rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted"); - if (_memory_span_size >= _memory_page_size) - _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); - } else { - // Special double flag to denote an unmapped master - // It must be kept in memory since span header must be used - span->flags |= - SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER; - _rpmalloc_stat_add(&_unmapped_master_spans, 1); - } - - if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { - // Everything unmapped, unmap the master span with release flag to unmap the - // entire range of the super span - rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) && - !!(master->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - size_t unmap_count = master->span_count; - if (_memory_span_size < _memory_page_size) - unmap_count = master->total_spans; - _rpmalloc_stat_sub(&_master_spans, 1); - _rpmalloc_stat_sub(&_unmapped_master_spans, 1); - _rpmalloc_unmap(master, unmap_count * _memory_span_size, - master->align_offset, - (size_t)master->total_spans * _memory_span_size); - } -} - -//! Move the span (used for small or medium allocations) to the heap thread -//! cache -static void _rpmalloc_span_release_to_cache(heap_t *heap, span_t *span) { - rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted"); - rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, - "Invalid span size class"); - rpmalloc_assert(span->span_count == 1, "Invalid span count"); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - atomic_decr32(&heap->span_use[0].current); -#endif - _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); - if (!heap->finalize) { - _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); - _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); - if (heap->size_class[span->size_class].cache) - _rpmalloc_heap_cache_insert(heap, - heap->size_class[span->size_class].cache); - heap->size_class[span->size_class].cache = span; - } else { - _rpmalloc_span_unmap(span); - } -} - -//! Initialize a (partial) free list up to next system memory page, while -//! 
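Background for the initializer that follows: the free list is intrusive, meaning the first machine word of every free block stores the pointer to the next free block, so no side structure is needed. The two primitive operations look like this (a sketch; the real code inlines them):

#include <stddef.h>

static void free_list_push(void **list, void *block) {
  *(void **)block = *list; // the next pointer lives inside the free block
  *list = block;
}

static void *free_list_pop(void **list) {
  void *block = *list;
  if (block)
    *list = *(void **)block;
  return block;
}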
reserving the first block as allocated, returning number of blocks in list -static uint32_t free_list_partial_init(void **list, void **first_block, - void *page_start, void *block_start, - uint32_t block_count, - uint32_t block_size) { - rpmalloc_assert(block_count, "Internal failure"); - *first_block = block_start; - if (block_count > 1) { - void *free_block = pointer_offset(block_start, block_size); - void *block_end = - pointer_offset(block_start, (size_t)block_size * block_count); - // If block size is less than half a memory page, bound init to next memory - // page boundary - if (block_size < (_memory_page_size >> 1)) { - void *page_end = pointer_offset(page_start, _memory_page_size); - if (page_end < block_end) - block_end = page_end; - } - *list = free_block; - block_count = 2; - void *next_block = pointer_offset(free_block, block_size); - while (next_block < block_end) { - *((void **)free_block) = next_block; - free_block = next_block; - ++block_count; - next_block = pointer_offset(next_block, block_size); - } - *((void **)free_block) = 0; - } else { - *list = 0; - } - return block_count; -} - -//! Initialize an unused span (from cache or mapped) to be new active span, -//! putting the initial free list in heap class free list -static void *_rpmalloc_span_initialize_new(heap_t *heap, - heap_size_class_t *heap_size_class, - span_t *span, uint32_t class_idx) { - rpmalloc_assert(span->span_count == 1, "Internal failure"); - size_class_t *size_class = _memory_size_class + class_idx; - span->size_class = class_idx; - span->heap = heap; - span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; - span->block_size = size_class->block_size; - span->block_count = size_class->block_count; - span->free_list = 0; - span->list_size = 0; - atomic_store_ptr_release(&span->free_list_deferred, 0); - - // Setup free list. 
Only initialize one system page worth of free blocks in - // list - void *block; - span->free_list_limit = - free_list_partial_init(&heap_size_class->free_list, &block, span, - pointer_offset(span, SPAN_HEADER_SIZE), - size_class->block_count, size_class->block_size); - // Link span as partial if there remains blocks to be initialized as free - // list, or full if fully initialized - if (span->free_list_limit < span->block_count) { - _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span); - span->used_count = span->free_list_limit; - } else { -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); -#endif - ++heap->full_span_count; - span->used_count = span->block_count; - } - return block; -} - -static void _rpmalloc_span_extract_free_list_deferred(span_t *span) { - // We need acquire semantics on the CAS operation since we are interested in - // the list size Refer to _rpmalloc_deallocate_defer_small_or_medium for - // further comments on this dependency - do { - span->free_list = - atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); - } while (span->free_list == INVALID_POINTER); - span->used_count -= span->list_size; - span->list_size = 0; - atomic_store_ptr_release(&span->free_list_deferred, 0); -} - -static int _rpmalloc_span_is_fully_utilized(span_t *span) { - rpmalloc_assert(span->free_list_limit <= span->block_count, - "Span free list corrupted"); - return !span->free_list && (span->free_list_limit >= span->block_count); -} - -static int _rpmalloc_span_finalize(heap_t *heap, size_t iclass, span_t *span, - span_t **list_head) { - void *free_list = heap->size_class[iclass].free_list; - span_t *class_span = (span_t *)((uintptr_t)free_list & _memory_span_mask); - if (span == class_span) { - // Adopt the heap class free list back into the span free list - void *block = span->free_list; - void *last_block = 0; - while (block) { - last_block = block; - block = *((void **)block); - } - uint32_t free_count = 0; - block = free_list; - while (block) { - ++free_count; - block = *((void **)block); - } - if (last_block) { - *((void **)last_block) = free_list; - } else { - span->free_list = free_list; - } - heap->size_class[iclass].free_list = 0; - span->used_count -= free_count; - } - // If this assert triggers you have memory leaks - rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected"); - if (span->list_size == span->used_count) { - _rpmalloc_stat_dec(&heap->span_use[0].current); - _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current); - // This function only used for spans in double linked lists - if (list_head) - _rpmalloc_span_double_link_list_remove(list_head, span); - _rpmalloc_span_unmap(span); - return 1; - } - return 0; -} - -//////////// -/// -/// Global cache -/// -////// - -#if ENABLE_GLOBAL_CACHE - -//! Finalize a global cache -static void _rpmalloc_global_cache_finalize(global_cache_t *cache) { - while (!atomic_cas32_acquire(&cache->lock, 1, 0)) - _rpmalloc_spin(); - - for (size_t ispan = 0; ispan < cache->count; ++ispan) - _rpmalloc_span_unmap(cache->span[ispan]); - cache->count = 0; - - while (cache->overflow) { - span_t *span = cache->overflow; - cache->overflow = span->next; - _rpmalloc_span_unmap(span); - } - - atomic_store32_release(&cache->lock, 0); -} - -static void _rpmalloc_global_cache_insert_spans(span_t **span, - size_t span_count, - size_t count) { - const size_t cache_limit = - (span_count == 1) ? 
GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE
-                        : GLOBAL_CACHE_MULTIPLIER *
-                              (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
-
-  global_cache_t *cache = &_memory_span_cache[span_count - 1];
-
-  size_t insert_count = count;
-  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-    _rpmalloc_spin();
-
-#if ENABLE_STATISTICS
-  cache->insert_count += count;
-#endif
-  if ((cache->count + insert_count) > cache_limit)
-    insert_count = cache_limit - cache->count;
-
-  memcpy(cache->span + cache->count, span, sizeof(span_t *) * insert_count);
-  cache->count += (uint32_t)insert_count;
-
-#if ENABLE_UNLIMITED_CACHE
-  while (insert_count < count) {
-#else
-  // Enable unlimited cache if huge pages, or we will leak since it is unlikely
-  // that an entire huge page will be unmapped, and we're unable to partially
-  // decommit a huge page
-  while ((_memory_page_size > _memory_span_size) && (insert_count < count)) {
-#endif
-    span_t *current_span = span[insert_count++];
-    current_span->next = cache->overflow;
-    cache->overflow = current_span;
-  }
-  atomic_store32_release(&cache->lock, 0);
-
-  span_t *keep = 0;
-  for (size_t ispan = insert_count; ispan < count; ++ispan) {
-    span_t *current_span = span[ispan];
-    // Keep master spans that have remaining subspans to avoid dangling them
-    if ((current_span->flags & SPAN_FLAG_MASTER) &&
-        (atomic_load32(&current_span->remaining_spans) >
-         (int32_t)current_span->span_count)) {
-      current_span->next = keep;
-      keep = current_span;
-    } else {
-      _rpmalloc_span_unmap(current_span);
-    }
-  }
-
-  if (keep) {
-    while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-      _rpmalloc_spin();
-
-    size_t islot = 0;
-    while (keep) {
-      for (; islot < cache->count; ++islot) {
-        span_t *current_span = cache->span[islot];
-        if (!(current_span->flags & SPAN_FLAG_MASTER) ||
-            ((current_span->flags & SPAN_FLAG_MASTER) &&
-             (atomic_load32(&current_span->remaining_spans) <=
-              (int32_t)current_span->span_count))) {
-          _rpmalloc_span_unmap(current_span);
-          cache->span[islot] = keep;
-          break;
-        }
-      }
-      if (islot == cache->count)
-        break;
-      keep = keep->next;
-    }
-
-    if (keep) {
-      span_t *tail = keep;
-      while (tail->next)
-        tail = tail->next;
-      tail->next = cache->overflow;
-      cache->overflow = keep;
-    }
-
-    atomic_store32_release(&cache->lock, 0);
-  }
-}
-
-static size_t _rpmalloc_global_cache_extract_spans(span_t **span,
-                                                   size_t span_count,
-                                                   size_t count) {
-  global_cache_t *cache = &_memory_span_cache[span_count - 1];
-
-  size_t extract_count = 0;
-  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-    _rpmalloc_spin();
-
-#if ENABLE_STATISTICS
-  cache->extract_count += count;
-#endif
-  size_t want = count - extract_count;
-  if (want > cache->count)
-    want = cache->count;
-
-  memcpy(span + extract_count, cache->span + (cache->count - want),
-         sizeof(span_t *) * want);
-  cache->count -= (uint32_t)want;
-  extract_count += want;
-
-  while ((extract_count < count) && cache->overflow) {
-    span_t *current_span = cache->overflow;
-    span[extract_count++] = current_span;
-    cache->overflow = current_span->next;
-  }
-
-#if ENABLE_ASSERTS
-  for (size_t ispan = 0; ispan < extract_count; ++ispan) {
-    rpmalloc_assert(span[ispan]->span_count == span_count,
-                    "Global cache span count mismatch");
-  }
-#endif
-
-  atomic_store32_release(&cache->lock, 0);
-
-  return extract_count;
-}
-
-#endif
-
-////////////
-///
-/// Heap control
-///
-//////
-
-static void _rpmalloc_deallocate_huge(span_t *);
-
-//!
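Stripped of locking and the master-span bookkeeping, the global cache above is a fixed-size LIFO array with an intrusive overflow list. A reduced model of the insert/extract pair, with illustrative types rather than the library's:

#include <stddef.h>

#define CACHE_SLOTS 8

typedef struct node { struct node *next; } node_t;

typedef struct {
  size_t count;
  node_t *slot[CACHE_SLOTS];
  node_t *overflow;
} cache_t;

static void cache_insert(cache_t *c, node_t *n) {
  if (c->count < CACHE_SLOTS) {
    c->slot[c->count++] = n; // fast path: bounded array
  } else {
    n->next = c->overflow; // spill to the unbounded overflow list
    c->overflow = n;
  }
}

static node_t *cache_extract(cache_t *c) {
  if (c->count)
    return c->slot[--c->count];
  node_t *n = c->overflow;
  if (n)
    c->overflow = n->next;
  return n;
}

In the real allocator the array branch is the common case; the overflow list only absorbs spans that would otherwise be unmapped when unlimited caching (or huge pages) is in effect.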
Store the given spans as reserve in the given heap -static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, - span_t *reserve, - size_t reserve_span_count) { - heap->span_reserve_master = master; - heap->span_reserve = reserve; - heap->spans_reserved = (uint32_t)reserve_span_count; -} - -//! Adopt the deferred span cache list, optionally extracting the first single -//! span for immediate re-use -static void _rpmalloc_heap_cache_adopt_deferred(heap_t *heap, - span_t **single_span) { - span_t *span = (span_t *)((void *)atomic_exchange_ptr_acquire( - &heap->span_free_deferred, 0)); - while (span) { - span_t *next_span = (span_t *)span->free_list; - rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { - rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); - --heap->full_span_count; - _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], - span); -#endif - _rpmalloc_stat_dec(&heap->span_use[0].current); - _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); - if (single_span && !*single_span) - *single_span = span; - else - _rpmalloc_heap_cache_insert(heap, span); - } else { - if (span->size_class == SIZE_CLASS_HUGE) { - _rpmalloc_deallocate_huge(span); - } else { - rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, - "Span size class invalid"); - rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); - --heap->full_span_count; -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); -#endif - uint32_t idx = span->span_count - 1; - _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); - _rpmalloc_stat_dec(&heap->span_use[idx].current); - if (!idx && single_span && !*single_span) - *single_span = span; - else - _rpmalloc_heap_cache_insert(heap, span); - } - } - span = next_span; - } -} - -static void _rpmalloc_heap_unmap(heap_t *heap) { - if (!heap->master_heap) { - if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { - span_t *span = (span_t *)((uintptr_t)heap & _memory_span_mask); - _rpmalloc_span_unmap(span); - } - } else { - if (atomic_decr32(&heap->master_heap->child_count) == 0) { - _rpmalloc_heap_unmap(heap->master_heap); - } - } -} - -static void _rpmalloc_heap_global_finalize(heap_t *heap) { - if (heap->finalize++ > 1) { - --heap->finalize; - return; - } - - _rpmalloc_heap_finalize(heap); - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); - span_cache->count = 0; - } -#endif - - if (heap->full_span_count) { - --heap->finalize; - return; - } - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (heap->size_class[iclass].free_list || - heap->size_class[iclass].partial_span) { - --heap->finalize; - return; - } - } - // Heap is now completely free, unmap and remove from heap list - size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; - heap_t *list_heap = _memory_heaps[list_idx]; - if (list_heap == heap) { - _memory_heaps[list_idx] = heap->next_heap; - } else { - while (list_heap->next_heap != heap) - list_heap = list_heap->next_heap; - 
list_heap->next_heap = heap->next_heap; - } - - _rpmalloc_heap_unmap(heap); -} - -//! Insert a single span into thread heap cache, releasing to global cache if -//! overflow -static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span) { - if (UNEXPECTED(heap->finalize != 0)) { - _rpmalloc_span_unmap(span); - _rpmalloc_heap_global_finalize(heap); - return; - } -#if ENABLE_THREAD_CACHE - size_t span_count = span->span_count; - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); - if (span_count == 1) { - span_cache_t *span_cache = &heap->span_cache; - span_cache->span[span_cache->count++] = span; - if (span_cache->count == MAX_THREAD_SPAN_CACHE) { - const size_t remain_count = - MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, - THREAD_SPAN_CACHE_TRANSFER); - _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, - span_count, - THREAD_SPAN_CACHE_TRANSFER); -#else - for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) - _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); -#endif - span_cache->count = remain_count; - } - } else { - size_t cache_idx = span_count - 2; - span_large_cache_t *span_cache = heap->span_large_cache + cache_idx; - span_cache->span[span_cache->count++] = span; - const size_t cache_limit = - (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); - if (span_cache->count == cache_limit) { - const size_t transfer_limit = 2 + (cache_limit >> 2); - const size_t transfer_count = - (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit - ? THREAD_SPAN_LARGE_CACHE_TRANSFER - : transfer_limit); - const size_t remain_count = cache_limit - transfer_count; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - transfer_count * span_count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, - transfer_count); - _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, - span_count, transfer_count); -#else - for (size_t ispan = 0; ispan < transfer_count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); -#endif - span_cache->count = remain_count; - } - } -#else - (void)sizeof(heap); - _rpmalloc_span_unmap(span); -#endif -} - -//! Extract the given number of spans from the different cache levels -static span_t *_rpmalloc_heap_thread_cache_extract(heap_t *heap, - size_t span_count) { - span_t *span = 0; -#if ENABLE_THREAD_CACHE - span_cache_t *span_cache; - if (span_count == 1) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); - if (span_cache->count) { - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); - return span_cache->span[--span_cache->count]; - } -#endif - return span; -} - -static span_t *_rpmalloc_heap_thread_cache_deferred_extract(heap_t *heap, - size_t span_count) { - span_t *span = 0; - if (span_count == 1) { - _rpmalloc_heap_cache_adopt_deferred(heap, &span); - } else { - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - span = _rpmalloc_heap_thread_cache_extract(heap, span_count); - } - return span; -} - -static span_t *_rpmalloc_heap_reserved_extract(heap_t *heap, - size_t span_count) { - if (heap->spans_reserved >= span_count) - return _rpmalloc_span_map(heap, span_count); - return 0; -} - -//! 
Extract a span from the global cache -static span_t *_rpmalloc_heap_global_cache_extract(heap_t *heap, - size_t span_count) { -#if ENABLE_GLOBAL_CACHE -#if ENABLE_THREAD_CACHE - span_cache_t *span_cache; - size_t wanted_count; - if (span_count == 1) { - span_cache = &heap->span_cache; - wanted_count = THREAD_SPAN_CACHE_TRANSFER; - } else { - span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); - wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; - } - span_cache->count = _rpmalloc_global_cache_extract_spans( - span_cache->span, span_count, wanted_count); - if (span_cache->count) { - _rpmalloc_stat_add64(&heap->global_to_thread, - span_count * span_cache->count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, - span_cache->count); - return span_cache->span[--span_cache->count]; - } -#else - span_t *span = 0; - size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); - if (count) { - _rpmalloc_stat_add64(&heap->global_to_thread, - span_count * count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, - count); - return span; - } -#endif -#endif - (void)sizeof(heap); - (void)sizeof(span_count); - return 0; -} - -static void _rpmalloc_inc_span_statistics(heap_t *heap, size_t span_count, - uint32_t class_idx) { - (void)sizeof(heap); - (void)sizeof(span_count); - (void)sizeof(class_idx); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - uint32_t idx = (uint32_t)span_count - 1; - uint32_t current_count = - (uint32_t)atomic_incr32(&heap->span_use[idx].current); - if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) - atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); - _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, - heap->size_class_use[class_idx].spans_peak); -#endif -} - -//! Get a span from one of the cache levels (thread cache, reserved, global -//! cache) or fallback to mapping more memory -static span_t * -_rpmalloc_heap_extract_new_span(heap_t *heap, - heap_size_class_t *heap_size_class, - size_t span_count, uint32_t class_idx) { - span_t *span; -#if ENABLE_THREAD_CACHE - if (heap_size_class && heap_size_class->cache) { - span = heap_size_class->cache; - heap_size_class->cache = - (heap->span_cache.count - ? heap->span_cache.span[--heap->span_cache.count] - : 0); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } -#endif - (void)sizeof(class_idx); - // Allow 50% overhead to increase cache hits - size_t base_span_count = span_count; - size_t limit_span_count = - (span_count > 2) ? 
(span_count + (span_count >> 1)) : span_count; - if (limit_span_count > LARGE_CLASS_COUNT) - limit_span_count = LARGE_CLASS_COUNT; - do { - span = _rpmalloc_heap_thread_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } - span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } - span = _rpmalloc_heap_global_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } - span = _rpmalloc_heap_reserved_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } - ++span_count; - } while (span_count <= limit_span_count); - // Final fallback, map in more virtual memory - span = _rpmalloc_span_map(heap, base_span_count); - _rpmalloc_inc_span_statistics(heap, base_span_count, class_idx); - _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls); - return span; -} - -static void _rpmalloc_heap_initialize(heap_t *heap) { - _rpmalloc_memset_const(heap, 0, sizeof(heap_t)); - // Get a new heap ID - heap->id = 1 + atomic_incr32(&_memory_heap_id); - - // Link in heap in heap ID map - size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; - heap->next_heap = _memory_heaps[list_idx]; - _memory_heaps[list_idx] = heap; -} - -static void _rpmalloc_heap_orphan(heap_t *heap, int first_class) { - heap->owner_thread = (uintptr_t)-1; -#if RPMALLOC_FIRST_CLASS_HEAPS - heap_t **heap_list = - (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps); -#else - (void)sizeof(first_class); - heap_t **heap_list = &_memory_orphan_heaps; -#endif - heap->next_orphan = *heap_list; - *heap_list = heap; -} - -//! Allocate a new heap from newly mapped memory pages -static heap_t *_rpmalloc_heap_allocate_new(void) { - // Map in pages for a 16 heaps. If page size is greater than required size for - // this, map a page and use first part for heaps and remaining part for spans - // for allocations. 
Adds a lot of complexity, but saves a lot of memory on - // systems where page size > 64 spans (4MiB) - size_t heap_size = sizeof(heap_t); - size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); - size_t request_heap_count = 16; - size_t heap_span_count = ((aligned_heap_size * request_heap_count) + - sizeof(span_t) + _memory_span_size - 1) / - _memory_span_size; - size_t block_size = _memory_span_size * heap_span_count; - size_t span_count = heap_span_count; - span_t *span = 0; - // If there are global reserved spans, use these first - if (_memory_global_reserve_count >= heap_span_count) { - span = _rpmalloc_global_get_reserved_spans(heap_span_count); - } - if (!span) { - if (_memory_page_size > block_size) { - span_count = _memory_page_size / _memory_span_size; - block_size = _memory_page_size; - // If using huge pages, make sure to grab enough heaps to avoid - // reallocating a huge page just to serve new heaps - size_t possible_heap_count = - (block_size - sizeof(span_t)) / aligned_heap_size; - if (possible_heap_count >= (request_heap_count * 16)) - request_heap_count *= 16; - else if (possible_heap_count < request_heap_count) - request_heap_count = possible_heap_count; - heap_span_count = ((aligned_heap_size * request_heap_count) + - sizeof(span_t) + _memory_span_size - 1) / - _memory_span_size; - } - - size_t align_offset = 0; - span = (span_t *)_rpmalloc_mmap(block_size, &align_offset); - if (!span) - return 0; - - // Master span will contain the heaps - _rpmalloc_stat_inc(&_master_spans); - _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); - } - - size_t remain_size = _memory_span_size - sizeof(span_t); - heap_t *heap = (heap_t *)pointer_offset(span, sizeof(span_t)); - _rpmalloc_heap_initialize(heap); - - // Put extra heaps as orphans - size_t num_heaps = remain_size / aligned_heap_size; - if (num_heaps < request_heap_count) - num_heaps = request_heap_count; - atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); - heap_t *extra_heap = (heap_t *)pointer_offset(heap, aligned_heap_size); - while (num_heaps > 1) { - _rpmalloc_heap_initialize(extra_heap); - extra_heap->master_heap = heap; - _rpmalloc_heap_orphan(extra_heap, 1); - extra_heap = (heap_t *)pointer_offset(extra_heap, aligned_heap_size); - --num_heaps; - } - - if (span_count > heap_span_count) { - // Cap reserved spans - size_t remain_count = span_count - heap_span_count; - size_t reserve_count = - (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count - : remain_count); - span_t *remain_span = - (span_t *)pointer_offset(span, heap_span_count * _memory_span_size); - _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); - - if (remain_count > reserve_count) { - // Set to global reserved spans - remain_span = (span_t *)pointer_offset(remain_span, - reserve_count * _memory_span_size); - reserve_count = remain_count - reserve_count; - _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); - } - } - - return heap; -} - -static heap_t *_rpmalloc_heap_extract_orphan(heap_t **heap_list) { - heap_t *heap = *heap_list; - *heap_list = (heap ? heap->next_orphan : 0); - return heap; -} - -//! 
Allocate a new heap, potentially reusing a previously orphaned heap -static heap_t *_rpmalloc_heap_allocate(int first_class) { - heap_t *heap = 0; - while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) - _rpmalloc_spin(); - if (first_class == 0) - heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); -#if RPMALLOC_FIRST_CLASS_HEAPS - if (!heap) - heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); -#endif - if (!heap) - heap = _rpmalloc_heap_allocate_new(); - atomic_store32_release(&_memory_global_lock, 0); - if (heap) - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - return heap; -} - -static void _rpmalloc_heap_release(void *heapptr, int first_class, - int release_cache) { - heap_t *heap = (heap_t *)heapptr; - if (!heap) - return; - // Release thread cache spans back to global cache - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - if (release_cache || heap->finalize) { -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - if (!span_cache->count) - continue; -#if ENABLE_GLOBAL_CACHE - if (heap->finalize) { - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); - } else { - _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * - (iclass + 1) * - _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, - span_cache->count); - _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, - span_cache->count); - } -#else - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); -#endif - span_cache->count = 0; - } -#endif - } - - if (get_thread_heap_raw() == heap) - set_thread_heap(0); - -#if ENABLE_STATISTICS - atomic_decr32(&_memory_active_heaps); - rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, - "Still active heaps during finalization"); -#endif - - // If we are forcibly terminating with _exit the state of the - // lock atomic is unknown and it's best to just go ahead and exit - if (get_thread_id() != _rpmalloc_main_thread_id) { - while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) - _rpmalloc_spin(); - } - _rpmalloc_heap_orphan(heap, first_class); - atomic_store32_release(&_memory_global_lock, 0); -} - -static void _rpmalloc_heap_release_raw(void *heapptr, int release_cache) { - _rpmalloc_heap_release(heapptr, 0, release_cache); -} - -static void _rpmalloc_heap_release_raw_fc(void *heapptr) { - _rpmalloc_heap_release_raw(heapptr, 1); -} - -static void _rpmalloc_heap_finalize(heap_t *heap) { - if (heap->spans_reserved) { - span_t *span = _rpmalloc_span_map(heap, heap->spans_reserved); - _rpmalloc_span_unmap(span); - heap->spans_reserved = 0; - } - - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (heap->size_class[iclass].cache) - _rpmalloc_span_unmap(heap->size_class[iclass].cache); - heap->size_class[iclass].cache = 0; - span_t *span = heap->size_class[iclass].partial_span; - while (span) { - span_t *next = span->next; - _rpmalloc_span_finalize(heap, iclass, span, - &heap->size_class[iclass].partial_span); - span = next; - } - // If class still has a free list it must be a full span - if (heap->size_class[iclass].free_list) { - span_t *class_span = - (span_t *)((uintptr_t)heap->size_class[iclass].free_list & - _memory_span_mask); 
-      span_t **list = 0;
-#if RPMALLOC_FIRST_CLASS_HEAPS
-      list = &heap->full_span[iclass];
-#endif
-      --heap->full_span_count;
-      if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) {
-        if (list)
-          _rpmalloc_span_double_link_list_remove(list, class_span);
-        _rpmalloc_span_double_link_list_add(
-            &heap->size_class[iclass].partial_span, class_span);
-      }
-    }
-  }
-
-#if ENABLE_THREAD_CACHE
-  for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-    span_cache_t *span_cache;
-    if (!iclass)
-      span_cache = &heap->span_cache;
-    else
-      span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1));
-    for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
-      _rpmalloc_span_unmap(span_cache->span[ispan]);
-    span_cache->count = 0;
-  }
-#endif
-  rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred),
-                  "Heaps still active during finalization");
-}
-
-////////////
-///
-/// Allocation entry points
-///
-//////
-
-//! Pop first block from a free list
-static void *free_list_pop(void **list) {
-  void *block = *list;
-  *list = *((void **)block);
-  return block;
-}
-
-//! Allocate a small/medium sized memory block from the given heap
-static void *_rpmalloc_allocate_from_heap_fallback(
-    heap_t *heap, heap_size_class_t *heap_size_class, uint32_t class_idx) {
-  span_t *span = heap_size_class->partial_span;
-  rpmalloc_assume(heap != 0);
-  if (EXPECTED(span != 0)) {
-    rpmalloc_assert(span->block_count ==
-                        _memory_size_class[span->size_class].block_count,
-                    "Span block count corrupted");
-    rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span),
-                    "Internal failure");
-    void *block;
-    if (span->free_list) {
-      // Span local free list is not empty, swap to size class free list
-      block = free_list_pop(&span->free_list);
-      heap_size_class->free_list = span->free_list;
-      span->free_list = 0;
-    } else {
-      // If the span did not fully initialize free list, link up another page
-      // worth of blocks
-      void *block_start = pointer_offset(
-          span, SPAN_HEADER_SIZE +
-                    ((size_t)span->free_list_limit * span->block_size));
-      span->free_list_limit += free_list_partial_init(
-          &heap_size_class->free_list, &block,
-          (void *)((uintptr_t)block_start & ~(_memory_page_size - 1)),
-          block_start, span->block_count - span->free_list_limit,
-          span->block_size);
-    }
-    rpmalloc_assert(span->free_list_limit <= span->block_count,
-                    "Span block count corrupted");
-    span->used_count = span->free_list_limit;
-
-    // Swap in deferred free list if present
-    if (atomic_load_ptr(&span->free_list_deferred))
-      _rpmalloc_span_extract_free_list_deferred(span);
-
-    // If span is still not fully utilized keep it in partial list and early
-    // return block
-    if (!_rpmalloc_span_is_fully_utilized(span))
-      return block;
-
-    // The span is fully utilized, unlink from partial list and add to fully
-    // utilized list
-    _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span,
-                                             span);
-#if RPMALLOC_FIRST_CLASS_HEAPS
-    _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
-#endif
-    ++heap->full_span_count;
-    return block;
-  }
-
-  // Find a span in one of the cache levels
-  span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx);
-  if (EXPECTED(span != 0)) {
-    // Mark span as owned by this heap and set base data, return first block
-    return _rpmalloc_span_initialize_new(heap, heap_size_class, span,
-                                         class_idx);
-  }
-
-  return 0;
-}
-
-//! Allocate a small sized memory block from the given heap
-static void *_rpmalloc_allocate_small(heap_t *heap, size_t size) {
-  rpmalloc_assert(heap, "No thread heap");
-  // Small sizes have unique size classes
-  const uint32_t class_idx =
-      (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT);
-  heap_size_class_t *heap_size_class = heap->size_class + class_idx;
-  _rpmalloc_stat_inc_alloc(heap, class_idx);
-  if (EXPECTED(heap_size_class->free_list != 0))
-    return free_list_pop(&heap_size_class->free_list);
-  return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class,
-                                               class_idx);
-}
-
-//! Allocate a medium sized memory block from the given heap
-static void *_rpmalloc_allocate_medium(heap_t *heap, size_t size) {
-  rpmalloc_assert(heap, "No thread heap");
-  // Calculate the size class index and do a dependent lookup of the final class
-  // index (in case of merged classes)
-  const uint32_t base_idx =
-      (uint32_t)(SMALL_CLASS_COUNT +
-                 ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT));
-  const uint32_t class_idx = _memory_size_class[base_idx].class_idx;
-  heap_size_class_t *heap_size_class = heap->size_class + class_idx;
-  _rpmalloc_stat_inc_alloc(heap, class_idx);
-  if (EXPECTED(heap_size_class->free_list != 0))
-    return free_list_pop(&heap_size_class->free_list);
-  return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class,
-                                               class_idx);
-}
-
-//! Allocate a large sized memory block from the given heap
-static void *_rpmalloc_allocate_large(heap_t *heap, size_t size) {
-  rpmalloc_assert(heap, "No thread heap");
-  // Calculate number of needed max sized spans (including header)
-  // Since this function is never called if size > LARGE_SIZE_LIMIT
-  // the span_count is guaranteed to be <= LARGE_CLASS_COUNT
-  size += SPAN_HEADER_SIZE;
-  size_t span_count = size >> _memory_span_size_shift;
-  if (size & (_memory_span_size - 1))
-    ++span_count;
-
-  // Find a span in one of the cache levels
-  span_t *span =
-      _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE);
-  if (!span)
-    return span;
-
-  // Mark span as owned by this heap and set base data
-  rpmalloc_assert(span->span_count >= span_count, "Internal failure");
-  span->size_class = SIZE_CLASS_LARGE;
-  span->heap = heap;
-
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span);
-#endif
-  ++heap->full_span_count;
-
-  return pointer_offset(span, SPAN_HEADER_SIZE);
-}
-
-//! Allocate a huge block by mapping memory pages directly
-static void *_rpmalloc_allocate_huge(heap_t *heap, size_t size) {
-  rpmalloc_assert(heap, "No thread heap");
-  _rpmalloc_heap_cache_adopt_deferred(heap, 0);
-  size += SPAN_HEADER_SIZE;
-  size_t num_pages = size >> _memory_page_size_shift;
-  if (size & (_memory_page_size - 1))
-    ++num_pages;
-  size_t align_offset = 0;
-  span_t *span =
-      (span_t *)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset);
-  if (!span)
-    return span;
-
-  // Store page count in span_count
-  span->size_class = SIZE_CLASS_HUGE;
-  span->span_count = (uint32_t)num_pages;
-  span->align_offset = (uint32_t)align_offset;
-  span->heap = heap;
-  _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak);
-
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span);
-#endif
-  ++heap->full_span_count;
-
-  return pointer_offset(span, SPAN_HEADER_SIZE);
-}
-
-//! Allocate a block of the given size
-static void *_rpmalloc_allocate(heap_t *heap, size_t size) {
-  _rpmalloc_stat_add64(&_allocation_counter, 1);
-  if (EXPECTED(size <= SMALL_SIZE_LIMIT))
-    return _rpmalloc_allocate_small(heap, size);
-  else if (size <= _memory_medium_size_limit)
-    return _rpmalloc_allocate_medium(heap, size);
-  else if (size <= LARGE_SIZE_LIMIT)
-    return _rpmalloc_allocate_large(heap, size);
-  return _rpmalloc_allocate_huge(heap, size);
-}
-
-static void *_rpmalloc_aligned_allocate(heap_t *heap, size_t alignment,
-                                        size_t size) {
-  if (alignment <= SMALL_GRANULARITY)
-    return _rpmalloc_allocate(heap, size);
-
-#if ENABLE_VALIDATE_ARGS
-  if ((size + alignment) < size) {
-    errno = EINVAL;
-    return 0;
-  }
-  if (alignment & (alignment - 1)) {
-    errno = EINVAL;
-    return 0;
-  }
-#endif
-
-  if ((alignment <= SPAN_HEADER_SIZE) &&
-      ((size + SPAN_HEADER_SIZE) < _memory_medium_size_limit)) {
-    // If alignment is less or equal to span header size (which is power of
-    // two), and size aligned to span header size multiples is less than size +
-    // alignment, then use natural alignment of blocks to provide alignment
-    size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) &
-                                      ~(uintptr_t)(SPAN_HEADER_SIZE - 1)
-                                : SPAN_HEADER_SIZE;
-    rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE),
-                    "Failed alignment calculation");
-    if (multiple_size <= (size + alignment))
-      return _rpmalloc_allocate(heap, multiple_size);
-  }
-
-  void *ptr = 0;
-  size_t align_mask = alignment - 1;
-  if (alignment <= _memory_page_size) {
-    ptr = _rpmalloc_allocate(heap, size + alignment);
-    if ((uintptr_t)ptr & align_mask) {
-      ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
-      // Mark as having aligned blocks
-      span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask);
-      span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
-    }
-    return ptr;
-  }
-
-  // Fallback to mapping new pages for this request. Since pointers passed
-  // to rpfree must be able to reach the start of the span by bitmasking of
-  // the address with the span size, the returned aligned pointer from this
-  // function must be with a span size of the start of the mapped area.
-  // In worst case this requires us to loop and map pages until we get a
-  // suitable memory address. It also means we can never align to span size
-  // or greater, since the span header will push alignment more than one
-  // span size away from span start (thus causing pointer mask to give us
-  // an invalid span start on free)
-  if (alignment & align_mask) {
-    errno = EINVAL;
-    return 0;
-  }
-  if (alignment >= _memory_span_size) {
-    errno = EINVAL;
-    return 0;
-  }
-
-  size_t extra_pages = alignment / _memory_page_size;
-
-  // Since each span has a header, we will at least need one extra memory page
-  size_t num_pages = 1 + (size / _memory_page_size);
-  if (size & (_memory_page_size - 1))
-    ++num_pages;
-
-  if (extra_pages > num_pages)
-    num_pages = 1 + extra_pages;
-
-  size_t original_pages = num_pages;
-  size_t limit_pages = (_memory_span_size / _memory_page_size) * 2;
-  if (limit_pages < (original_pages * 2))
-    limit_pages = original_pages * 2;
-
-  size_t mapped_size, align_offset;
-  span_t *span;
-
-retry:
-  align_offset = 0;
-  mapped_size = num_pages * _memory_page_size;
-
-  span = (span_t *)_rpmalloc_mmap(mapped_size, &align_offset);
-  if (!span) {
-    errno = ENOMEM;
-    return 0;
-  }
-  ptr = pointer_offset(span, SPAN_HEADER_SIZE);
-
-  if ((uintptr_t)ptr & align_mask)
-    ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
-
-  if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) ||
-      (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) ||
-      (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) {
-    _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size);
-    ++num_pages;
-    if (num_pages > limit_pages) {
-      errno = EINVAL;
-      return 0;
-    }
-    goto retry;
-  }
-
-  // Store page count in span_count
-  span->size_class = SIZE_CLASS_HUGE;
-  span->span_count = (uint32_t)num_pages;
-  span->align_offset = (uint32_t)align_offset;
-  span->heap = heap;
-  _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak);
-
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span);
-#endif
-  ++heap->full_span_count;
-
-  _rpmalloc_stat_add64(&_allocation_counter, 1);
-
-  return ptr;
-}
-
-////////////
-///
-/// Deallocation entry points
-///
-//////
-
-//! Deallocate the given small/medium memory block in the current thread local
-//! heap
-static void _rpmalloc_deallocate_direct_small_or_medium(span_t *span,
-                                                        void *block) {
-  heap_t *heap = span->heap;
-  rpmalloc_assert(heap->owner_thread == get_thread_id() ||
-                      !heap->owner_thread || heap->finalize,
-                  "Internal failure");
-  // Add block to free list
-  if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) {
-    span->used_count = span->block_count;
-#if RPMALLOC_FIRST_CLASS_HEAPS
-    _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class],
-                                           span);
-#endif
-    _rpmalloc_span_double_link_list_add(
-        &heap->size_class[span->size_class].partial_span, span);
-    --heap->full_span_count;
-  }
-  *((void **)block) = span->free_list;
-  --span->used_count;
-  span->free_list = block;
-  if (UNEXPECTED(span->used_count == span->list_size)) {
-    // If there are no used blocks it is guaranteed that no other external
-    // thread is accessing the span
-    if (span->used_count) {
-      // Make sure we have synchronized the deferred list and list size by using
-      // acquire semantics and guarantee that no external thread is accessing
-      // span concurrently
-      void *free_list;
-      do {
-        free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred,
-                                                INVALID_POINTER);
-      } while (free_list == INVALID_POINTER);
-      atomic_store_ptr_release(&span->free_list_deferred, free_list);
-    }
-    _rpmalloc_span_double_link_list_remove(
-        &heap->size_class[span->size_class].partial_span, span);
-    _rpmalloc_span_release_to_cache(heap, span);
-  }
-}
-
-static void _rpmalloc_deallocate_defer_free_span(heap_t *heap, span_t *span) {
-  if (span->size_class != SIZE_CLASS_HUGE)
-    _rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred);
-  // This list does not need ABA protection, no mutable side state
-  do {
-    span->free_list = (void *)atomic_load_ptr(&heap->span_free_deferred);
-  } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list));
-}
-
-//! Put the block in the deferred free list of the owning span
-static void _rpmalloc_deallocate_defer_small_or_medium(span_t *span,
-                                                       void *block) {
-  // The memory ordering here is a bit tricky, to avoid having to ABA protect
-  // the deferred free list to avoid desynchronization of list and list size
-  // we need to have acquire semantics on successful CAS of the pointer to
-  // guarantee the list_size variable validity + release semantics on pointer
-  // store
-  void *free_list;
-  do {
-    free_list =
-        atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
-  } while (free_list == INVALID_POINTER);
-  *((void **)block) = free_list;
-  uint32_t free_count = ++span->list_size;
-  int all_deferred_free = (free_count == span->block_count);
-  atomic_store_ptr_release(&span->free_list_deferred, block);
-  if (all_deferred_free) {
-    // Span was completely freed by this block. Due to the INVALID_POINTER spin
-    // lock no other thread can reach this state simultaneously on this span.
-    // Safe to move to owner heap deferred cache
-    _rpmalloc_deallocate_defer_free_span(span->heap, span);
-  }
-}
-
-static void _rpmalloc_deallocate_small_or_medium(span_t *span, void *p) {
-  _rpmalloc_stat_inc_free(span->heap, span->size_class);
-  if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) {
-    // Realign pointer to block start
-    void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
-    uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
-    p = pointer_offset(p, -(int32_t)(block_offset % span->block_size));
-  }
-  // Check if block belongs to this heap or if deallocation should be deferred
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  int defer =
-      (span->heap->owner_thread &&
-       (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
-#else
-  int defer =
-      ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
-#endif
-  if (!defer)
-    _rpmalloc_deallocate_direct_small_or_medium(span, p);
-  else
-    _rpmalloc_deallocate_defer_small_or_medium(span, p);
-}
-
-//! Deallocate the given large memory block to the current heap
-static void _rpmalloc_deallocate_large(span_t *span) {
-  rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class");
-  rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) ||
-                      !(span->flags & SPAN_FLAG_SUBSPAN),
-                  "Span flag corrupted");
-  rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) ||
-                      (span->flags & SPAN_FLAG_SUBSPAN),
-                  "Span flag corrupted");
-  // We must always defer (unless finalizing) if from another heap since we
-  // cannot touch the list or counters of another heap
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  int defer =
-      (span->heap->owner_thread &&
-       (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
-#else
-  int defer =
-      ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
-#endif
-  if (defer) {
-    _rpmalloc_deallocate_defer_free_span(span->heap, span);
-    return;
-  }
-  rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted");
-  --span->heap->full_span_count;
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
-#endif
-#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
-  // Decrease counter
-  size_t idx = span->span_count - 1;
-  atomic_decr32(&span->heap->span_use[idx].current);
-#endif
-  heap_t *heap = span->heap;
-  rpmalloc_assert(heap, "No thread heap");
-#if ENABLE_THREAD_CACHE
-  const int set_as_reserved =
-      ((span->span_count > 1) && (heap->span_cache.count == 0) &&
-       !heap->finalize && !heap->spans_reserved);
-#else
-  const int set_as_reserved =
-      ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved);
-#endif
-  if (set_as_reserved) {
-    heap->span_reserve = span;
-    heap->spans_reserved = span->span_count;
-    if (span->flags & SPAN_FLAG_MASTER) {
-      heap->span_reserve_master = span;
-    } else { // SPAN_FLAG_SUBSPAN
-      span_t *master = (span_t *)pointer_offset(
-          span,
-          -(intptr_t)((size_t)span->offset_from_master * _memory_span_size));
-      heap->span_reserve_master = master;
-      rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted");
-      rpmalloc_assert(atomic_load32(&master->remaining_spans) >=
-                          (int32_t)span->span_count,
-                      "Master span count corrupted");
-    }
-    _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved);
-  } else {
-    // Insert into cache list
-    _rpmalloc_heap_cache_insert(heap, span);
-  }
-}
-
-//! Deallocate the given huge span
-static void _rpmalloc_deallocate_huge(span_t *span) {
-  rpmalloc_assert(span->heap, "No span heap");
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  int defer =
-      (span->heap->owner_thread &&
-       (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
-#else
-  int defer =
-      ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
-#endif
-  if (defer) {
-    _rpmalloc_deallocate_defer_free_span(span->heap, span);
-    return;
-  }
-  rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted");
-  --span->heap->full_span_count;
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
-#endif
-
-  // Oversized allocation, page count is stored in span_count
-  size_t num_pages = span->span_count;
-  _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset,
-                  num_pages * _memory_page_size);
-  _rpmalloc_stat_sub(&_huge_pages_current, num_pages);
-}
-
-//! Deallocate the given block
-static void _rpmalloc_deallocate(void *p) {
-  _rpmalloc_stat_add64(&_deallocation_counter, 1);
-  // Grab the span (always at start of span, using span alignment)
-  span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
-  if (UNEXPECTED(!span))
-    return;
-  if (EXPECTED(span->size_class < SIZE_CLASS_COUNT))
-    _rpmalloc_deallocate_small_or_medium(span, p);
-  else if (span->size_class == SIZE_CLASS_LARGE)
-    _rpmalloc_deallocate_large(span);
-  else
-    _rpmalloc_deallocate_huge(span);
-}
-
-////////////
-///
-/// Reallocation entry points
-///
-//////
-
-static size_t _rpmalloc_usable_size(void *p);
-
-//! Reallocate the given block to the given size
-static void *_rpmalloc_reallocate(heap_t *heap, void *p, size_t size,
-                                  size_t oldsize, unsigned int flags) {
-  if (p) {
-    // Grab the span using guaranteed span alignment
-    span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
-    if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) {
-      // Small/medium sized block
-      rpmalloc_assert(span->span_count == 1, "Span counter corrupted");
-      void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
-      uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
-      uint32_t block_idx = block_offset / span->block_size;
-      void *block =
-          pointer_offset(blocks_start, (size_t)block_idx * span->block_size);
-      if (!oldsize)
-        oldsize =
-            (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block));
-      if ((size_t)span->block_size >= size) {
-        // Still fits in block, never mind trying to save memory, but preserve
-        // data if alignment changed
-        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
-          memmove(block, p, oldsize);
-        return block;
-      }
-    } else if (span->size_class == SIZE_CLASS_LARGE) {
-      // Large block
-      size_t total_size = size + SPAN_HEADER_SIZE;
-      size_t num_spans = total_size >> _memory_span_size_shift;
-      if (total_size & (_memory_span_mask - 1))
-        ++num_spans;
-      size_t current_spans = span->span_count;
-      void *block = pointer_offset(span, SPAN_HEADER_SIZE);
-      if (!oldsize)
-        oldsize = (current_spans * _memory_span_size) -
-                  (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
-      if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) {
-        // Still fits in block, never mind trying to save memory, but preserve
-        // data if alignment changed
-        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
-          memmove(block, p, oldsize);
-        return block;
-      }
-    } else {
-      // Oversized block
-      size_t total_size = size + SPAN_HEADER_SIZE;
-      size_t num_pages = total_size >> _memory_page_size_shift;
-      if (total_size & (_memory_page_size - 1))
-        ++num_pages;
-      // Page count is stored in span_count
-      size_t current_pages = span->span_count;
-      void *block = pointer_offset(span, SPAN_HEADER_SIZE);
-      if (!oldsize)
-        oldsize = (current_pages * _memory_page_size) -
-                  (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
-      if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) {
-        // Still fits in block, never mind trying to save memory, but preserve
-        // data if alignment changed
-        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
-          memmove(block, p, oldsize);
-        return block;
-      }
-    }
-  } else {
-    oldsize = 0;
-  }
-
-  if (!!(flags & RPMALLOC_GROW_OR_FAIL))
-    return 0;
-
-  // Size is greater than block size, need to allocate a new block and
-  // deallocate the old Avoid hysteresis by overallocating if increase is small
-  // (below 37%)
-  size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
-  size_t new_size =
-      (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
-  void *block = _rpmalloc_allocate(heap, new_size);
-  if (p && block) {
-    if (!(flags & RPMALLOC_NO_PRESERVE))
-      memcpy(block, p, oldsize < new_size ? oldsize : new_size);
-    _rpmalloc_deallocate(p);
-  }
-
-  return block;
-}
-
-static void *_rpmalloc_aligned_reallocate(heap_t *heap, void *ptr,
-                                          size_t alignment, size_t size,
-                                          size_t oldsize, unsigned int flags) {
-  if (alignment <= SMALL_GRANULARITY)
-    return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags);
-
-  int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL);
-  size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0);
-  if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) {
-    if (no_alloc || (size >= (usablesize / 2)))
-      return ptr;
-  }
-  // Aligned alloc marks span as having aligned blocks
-  void *block =
-      (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0);
-  if (EXPECTED(block != 0)) {
-    if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) {
-      if (!oldsize)
-        oldsize = usablesize;
-      memcpy(block, ptr, oldsize < size ? oldsize : size);
-    }
-    _rpmalloc_deallocate(ptr);
-  }
-  return block;
-}
-
-////////////
-///
-/// Initialization, finalization and utility
-///
-//////
-
-//! Get the usable size of the given block
-static size_t _rpmalloc_usable_size(void *p) {
-  // Grab the span using guaranteed span alignment
-  span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
-  if (span->size_class < SIZE_CLASS_COUNT) {
-    // Small/medium block
-    void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
-    return span->block_size -
-           ((size_t)pointer_diff(p, blocks_start) % span->block_size);
-  }
-  if (span->size_class == SIZE_CLASS_LARGE) {
-    // Large block
-    size_t current_spans = span->span_count;
-    return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span);
-  }
-  // Oversized block, page count is stored in span_count
-  size_t current_pages = span->span_count;
-  return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span);
-}
-
-//! Adjust and optimize the size class properties for the given class
-static void _rpmalloc_adjust_size_class(size_t iclass) {
-  size_t block_size = _memory_size_class[iclass].block_size;
-  size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size;
-
-  _memory_size_class[iclass].block_count = (uint16_t)block_count;
-  _memory_size_class[iclass].class_idx = (uint16_t)iclass;
-
-  // Check if previous size classes can be merged
-  if (iclass >= SMALL_CLASS_COUNT) {
-    size_t prevclass = iclass;
-    while (prevclass > 0) {
-      --prevclass;
-      // A class can be merged if number of pages and number of blocks are equal
-      if (_memory_size_class[prevclass].block_count ==
-          _memory_size_class[iclass].block_count)
-        _rpmalloc_memcpy_const(_memory_size_class + prevclass,
-                               _memory_size_class + iclass,
-                               sizeof(_memory_size_class[iclass]));
-      else
-        break;
-    }
-  }
-}
-
-//! Initialize the allocator and setup global data
-extern inline int rpmalloc_initialize(void) {
-  if (_rpmalloc_initialized) {
-    rpmalloc_thread_initialize();
-    return 0;
-  }
-  return rpmalloc_initialize_config(0);
-}
-
-int rpmalloc_initialize_config(const rpmalloc_config_t *config) {
-  if (_rpmalloc_initialized) {
-    rpmalloc_thread_initialize();
-    return 0;
-  }
-  _rpmalloc_initialized = 1;
-
-  if (config)
-    memcpy(&_memory_config, config, sizeof(rpmalloc_config_t));
-  else
-    _rpmalloc_memset_const(&_memory_config, 0, sizeof(rpmalloc_config_t));
-
-  if (!_memory_config.memory_map || !_memory_config.memory_unmap) {
-    _memory_config.memory_map = _rpmalloc_mmap_os;
-    _memory_config.memory_unmap = _rpmalloc_unmap_os;
-  }
-
-#if PLATFORM_WINDOWS
-  SYSTEM_INFO system_info;
-  memset(&system_info, 0, sizeof(system_info));
-  GetSystemInfo(&system_info);
-  _memory_map_granularity = system_info.dwAllocationGranularity;
-#else
-  _memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE);
-#endif
-
-#if RPMALLOC_CONFIGURABLE
-  _memory_page_size = _memory_config.page_size;
-#else
-  _memory_page_size = 0;
-#endif
-  _memory_huge_pages = 0;
-  if (!_memory_page_size) {
-#if PLATFORM_WINDOWS
-    _memory_page_size = system_info.dwPageSize;
-#else
-    _memory_page_size = _memory_map_granularity;
-    if (_memory_config.enable_huge_pages) {
-#if defined(__linux__)
-      size_t huge_page_size = 0;
-      FILE *meminfo = fopen("/proc/meminfo", "r");
-      if (meminfo) {
-        char line[128];
-        while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) {
-          line[sizeof(line) - 1] = 0;
-          if (strstr(line, "Hugepagesize:"))
-            huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024;
-        }
-        fclose(meminfo);
-      }
-      if (huge_page_size) {
-        _memory_huge_pages = 1;
-        _memory_page_size = huge_page_size;
-        _memory_map_granularity = huge_page_size;
-      }
-#elif defined(__FreeBSD__)
-      int rc;
-      size_t sz = sizeof(rc);
-
-      if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 &&
-          rc == 1) {
-        static size_t defsize = 2 * 1024 * 1024;
-        int nsize = 0;
-        size_t sizes[4] = {0};
-        _memory_huge_pages = 1;
-        _memory_page_size = defsize;
-        if ((nsize = getpagesizes(sizes, 4)) >= 2) {
-          nsize--;
-          for (size_t csize = sizes[nsize]; nsize >= 0 && csize;
-               --nsize, csize = sizes[nsize]) {
-            //! Unlikely, but as a precaution..
-            rpmalloc_assert(!(csize & (csize - 1)) && !(csize % 1024),
-                            "Invalid page size");
-            if (defsize < csize) {
-              _memory_page_size = csize;
-              break;
-            }
-          }
-        }
-        _memory_map_granularity = _memory_page_size;
-      }
-#elif defined(__APPLE__) || defined(__NetBSD__)
-      _memory_huge_pages = 1;
-      _memory_page_size = 2 * 1024 * 1024;
-      _memory_map_granularity = _memory_page_size;
-#endif
-    }
-#endif
-  } else {
-    if (_memory_config.enable_huge_pages)
-      _memory_huge_pages = 1;
-  }
-
-#if PLATFORM_WINDOWS
-  if (_memory_config.enable_huge_pages) {
-    HANDLE token = 0;
-    size_t large_page_minimum = GetLargePageMinimum();
-    if (large_page_minimum)
-      OpenProcessToken(GetCurrentProcess(),
-                       TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
-    if (token) {
-      LUID luid;
-      if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) {
-        TOKEN_PRIVILEGES token_privileges;
-        memset(&token_privileges, 0, sizeof(token_privileges));
-        token_privileges.PrivilegeCount = 1;
-        token_privileges.Privileges[0].Luid = luid;
-        token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
-        if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) {
-          if (GetLastError() == ERROR_SUCCESS)
-            _memory_huge_pages = 1;
-        }
-      }
-      CloseHandle(token);
-    }
-    if (_memory_huge_pages) {
-      if (large_page_minimum > _memory_page_size)
-        _memory_page_size = large_page_minimum;
-      if (large_page_minimum > _memory_map_granularity)
-        _memory_map_granularity = large_page_minimum;
-    }
-  }
-#endif
-
-  size_t min_span_size = 256;
-  size_t max_page_size;
-#if UINTPTR_MAX > 0xFFFFFFFF
-  max_page_size = 4096ULL * 1024ULL * 1024ULL;
-#else
-  max_page_size = 4 * 1024 * 1024;
-#endif
-  if (_memory_page_size < min_span_size)
-    _memory_page_size = min_span_size;
-  if (_memory_page_size > max_page_size)
-    _memory_page_size = max_page_size;
-  _memory_page_size_shift = 0;
-  size_t page_size_bit = _memory_page_size;
-  while (page_size_bit != 1) {
-    ++_memory_page_size_shift;
-    page_size_bit >>= 1;
-  }
-  _memory_page_size = ((size_t)1 << _memory_page_size_shift);
-
-#if RPMALLOC_CONFIGURABLE
-  if (!_memory_config.span_size) {
-    _memory_span_size = _memory_default_span_size;
-    _memory_span_size_shift = _memory_default_span_size_shift;
-    _memory_span_mask = _memory_default_span_mask;
-  } else {
-    size_t span_size = _memory_config.span_size;
-    if (span_size > (256 * 1024))
-      span_size = (256 * 1024);
-    _memory_span_size = 4096;
-    _memory_span_size_shift = 12;
-    while (_memory_span_size < span_size) {
-      _memory_span_size <<= 1;
-      ++_memory_span_size_shift;
-    }
-    _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1);
-  }
-#endif
-
-  _memory_span_map_count =
-      (_memory_config.span_map_count ? _memory_config.span_map_count
-                                     : DEFAULT_SPAN_MAP_COUNT);
-  if ((_memory_span_size * _memory_span_map_count) < _memory_page_size)
-    _memory_span_map_count = (_memory_page_size / _memory_span_size);
-  if ((_memory_page_size >= _memory_span_size) &&
-      ((_memory_span_map_count * _memory_span_size) % _memory_page_size))
-    _memory_span_map_count = (_memory_page_size / _memory_span_size);
-  _memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT)
-                                   ? DEFAULT_SPAN_MAP_COUNT
-                                   : _memory_span_map_count;
-
-  _memory_config.page_size = _memory_page_size;
-  _memory_config.span_size = _memory_span_size;
-  _memory_config.span_map_count = _memory_span_map_count;
-  _memory_config.enable_huge_pages = _memory_huge_pages;
-
-#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) ||          \
-    defined(__TINYC__)
-  if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc))
-    return -1;
-#endif
-#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-  fls_key = FlsAlloc(&_rpmalloc_thread_destructor);
-#endif
-
-  // Setup all small and medium size classes
-  size_t iclass = 0;
-  _memory_size_class[iclass].block_size = SMALL_GRANULARITY;
-  _rpmalloc_adjust_size_class(iclass);
-  for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) {
-    size_t size = iclass * SMALL_GRANULARITY;
-    _memory_size_class[iclass].block_size = (uint32_t)size;
-    _rpmalloc_adjust_size_class(iclass);
-  }
-  // At least two blocks per span, then fall back to large allocations
-  _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1;
-  if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT)
-    _memory_medium_size_limit = MEDIUM_SIZE_LIMIT;
-  for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) {
-    size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY);
-    if (size > _memory_medium_size_limit) {
-      _memory_medium_size_limit =
-          SMALL_SIZE_LIMIT + (iclass * MEDIUM_GRANULARITY);
-      break;
-    }
-    _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size;
-    _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass);
-  }
-
-  _memory_orphan_heaps = 0;
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  _memory_first_class_orphan_heaps = 0;
-#endif
-#if ENABLE_STATISTICS
-  atomic_store32(&_memory_active_heaps, 0);
-  atomic_store32(&_mapped_pages, 0);
-  _mapped_pages_peak = 0;
-  atomic_store32(&_master_spans, 0);
-  atomic_store32(&_mapped_total, 0);
-  atomic_store32(&_unmapped_total, 0);
-  atomic_store32(&_mapped_pages_os, 0);
-  atomic_store32(&_huge_pages_current, 0);
-  _huge_pages_peak = 0;
-#endif
-  memset(_memory_heaps, 0, sizeof(_memory_heaps));
-  atomic_store32_release(&_memory_global_lock, 0);
-
-  rpmalloc_linker_reference();
-
-  // Initialize this thread
-  rpmalloc_thread_initialize();
-  return 0;
-}
-
-//! Finalize the allocator
-void rpmalloc_finalize(void) {
-  rpmalloc_thread_finalize(1);
-  // rpmalloc_dump_statistics(stdout);
-
-  if (_memory_global_reserve) {
-    atomic_add32(&_memory_global_reserve_master->remaining_spans,
-                 -(int32_t)_memory_global_reserve_count);
-    _memory_global_reserve_master = 0;
-    _memory_global_reserve_count = 0;
-    _memory_global_reserve = 0;
-  }
-  atomic_store32_release(&_memory_global_lock, 0);
-
-  // Free all thread caches and fully free spans
-  for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
-    heap_t *heap = _memory_heaps[list_idx];
-    while (heap) {
-      heap_t *next_heap = heap->next_heap;
-      heap->finalize = 1;
-      _rpmalloc_heap_global_finalize(heap);
-      heap = next_heap;
-    }
-  }
-
-#if ENABLE_GLOBAL_CACHE
-  // Free global caches
-  for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass)
-    _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]);
-#endif
-
-#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
-  pthread_key_delete(_memory_thread_heap);
-#endif
-#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-  FlsFree(fls_key);
-  fls_key = 0;
-#endif
-#if ENABLE_STATISTICS
-  // If you hit these asserts you probably have memory leaks (perhaps global
-  // scope data doing dynamic allocations) or double frees in your code
-  rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected");
-  rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0,
-                  "Memory leak detected");
-#endif
-
-  _rpmalloc_initialized = 0;
-}
-
-//! Initialize thread, assign heap
-extern inline void rpmalloc_thread_initialize(void) {
-  if (!get_thread_heap_raw()) {
-    heap_t *heap = _rpmalloc_heap_allocate(0);
-    if (heap) {
-      _rpmalloc_stat_inc(&_memory_active_heaps);
-      set_thread_heap(heap);
-#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-      FlsSetValue(fls_key, heap);
-#endif
-    }
-  }
-}
-
-//! Finalize thread, orphan heap
-void rpmalloc_thread_finalize(int release_caches) {
-  heap_t *heap = get_thread_heap_raw();
-  if (heap)
-    _rpmalloc_heap_release_raw(heap, release_caches);
-  set_thread_heap(0);
-#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-  FlsSetValue(fls_key, 0);
-#endif
-}
-
-int rpmalloc_is_thread_initialized(void) {
-  return (get_thread_heap_raw() != 0) ? 1 : 0;
-}
-
-const rpmalloc_config_t *rpmalloc_config(void) { return &_memory_config; }
-
-// Extern interface
-
-extern inline RPMALLOC_ALLOCATOR void *rpmalloc(size_t size) {
-#if ENABLE_VALIDATE_ARGS
-  if (size >= MAX_ALLOC_SIZE) {
-    errno = EINVAL;
-    return 0;
-  }
-#endif
-  heap_t *heap = get_thread_heap();
-  return _rpmalloc_allocate(heap, size);
-}
-
-extern inline void rpfree(void *ptr) { _rpmalloc_deallocate(ptr); }
-
-extern inline RPMALLOC_ALLOCATOR void *rpcalloc(size_t num, size_t size) {
-  size_t total;
-#if ENABLE_VALIDATE_ARGS
-#if PLATFORM_WINDOWS
-  int err = SizeTMult(num, size, &total);
-  if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
-    errno = EINVAL;
-    return 0;
-  }
-#else
-  int err = __builtin_umull_overflow(num, size, &total);
-  if (err || (total >= MAX_ALLOC_SIZE)) {
-    errno = EINVAL;
-    return 0;
-  }
-#endif
-#else
-  total = num * size;
-#endif
-  heap_t *heap = get_thread_heap();
-  void *block = _rpmalloc_allocate(heap, total);
-  if (block)
-    memset(block, 0, total);
-  return block;
-}
-
-extern inline RPMALLOC_ALLOCATOR void *rprealloc(void *ptr, size_t size) {
-#if ENABLE_VALIDATE_ARGS
-  if (size >= MAX_ALLOC_SIZE) {
-    errno = EINVAL;
-    return ptr;
-  }
-#endif
-  heap_t *heap = get_thread_heap();
-  return _rpmalloc_reallocate(heap, ptr, size, 0, 0);
-}
-
-extern RPMALLOC_ALLOCATOR void *rpaligned_realloc(void *ptr, size_t alignment,
-                                                  size_t size, size_t oldsize,
-                                                  unsigned int flags) {
-#if ENABLE_VALIDATE_ARGS
-  if ((size + alignment < size) || (alignment > _memory_page_size)) {
-    errno = EINVAL;
-    return 0;
-  }
-#endif
-  heap_t *heap = get_thread_heap();
-  return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize,
-                                      flags);
-}
-
-extern RPMALLOC_ALLOCATOR void *rpaligned_alloc(size_t alignment, size_t size) {
-  heap_t *heap = get_thread_heap();
-  return _rpmalloc_aligned_allocate(heap, alignment, size);
-}
-
-extern inline RPMALLOC_ALLOCATOR void *
-rpaligned_calloc(size_t alignment, size_t num, size_t size) {
-  size_t total;
-#if ENABLE_VALIDATE_ARGS
-#if PLATFORM_WINDOWS
-  int err = SizeTMult(num, size, &total);
-  if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
-    errno = EINVAL;
-    return 0;
-  }
-#else
-  int err = __builtin_umull_overflow(num, size, &total);
-  if (err || (total >= MAX_ALLOC_SIZE)) {
-    errno = EINVAL;
-    return 0;
-  }
-#endif
-#else
-  total = num * size;
-#endif
-  void *block = rpaligned_alloc(alignment, total);
-  if (block)
-    memset(block, 0, total);
-  return block;
-}
-
-extern inline RPMALLOC_ALLOCATOR void *rpmemalign(size_t alignment,
-                                                  size_t size) {
-  return rpaligned_alloc(alignment, size);
-}
-
-extern inline int rpposix_memalign(void **memptr, size_t alignment,
-                                   size_t size) {
-  if (memptr)
-    *memptr = rpaligned_alloc(alignment, size);
-  else
-    return EINVAL;
-  return *memptr ? 0 : ENOMEM;
-}
-
-extern inline size_t rpmalloc_usable_size(void *ptr) {
-  return (ptr ? _rpmalloc_usable_size(ptr) : 0);
-}
-
-extern inline void rpmalloc_thread_collect(void) {}
-
-void rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats) {
-  memset(stats, 0, sizeof(rpmalloc_thread_statistics_t));
-  heap_t *heap = get_thread_heap_raw();
-  if (!heap)
-    return;
-
-  for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-    size_class_t *size_class = _memory_size_class + iclass;
-    span_t *span = heap->size_class[iclass].partial_span;
-    while (span) {
-      size_t free_count = span->list_size;
-      size_t block_count = size_class->block_count;
-      if (span->free_list_limit < block_count)
-        block_count = span->free_list_limit;
-      free_count += (block_count - span->used_count);
-      stats->sizecache += free_count * size_class->block_size;
-      span = span->next;
-    }
-  }
-
-#if ENABLE_THREAD_CACHE
-  for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-    span_cache_t *span_cache;
-    if (!iclass)
-      span_cache = &heap->span_cache;
-    else
-      span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1));
-    stats->spancache += span_cache->count * (iclass + 1) * _memory_span_size;
-  }
-#endif
-
-  span_t *deferred = (span_t *)atomic_load_ptr(&heap->span_free_deferred);
-  while (deferred) {
-    if (deferred->size_class != SIZE_CLASS_HUGE)
-      stats->spancache += (size_t)deferred->span_count * _memory_span_size;
-    deferred = (span_t *)deferred->free_list;
-  }
-
-#if ENABLE_STATISTICS
-  stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global);
-  stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread);
-
-  for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-    stats->span_use[iclass].current =
-        (size_t)atomic_load32(&heap->span_use[iclass].current);
-    stats->span_use[iclass].peak =
-        (size_t)atomic_load32(&heap->span_use[iclass].high);
-    stats->span_use[iclass].to_global =
-        (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global);
-    stats->span_use[iclass].from_global =
-        (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global);
-    stats->span_use[iclass].to_cache =
-        (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache);
-    stats->span_use[iclass].from_cache =
-        (size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache);
-    stats->span_use[iclass].to_reserved =
-        (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved);
-    stats->span_use[iclass].from_reserved =
-        (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved);
-    stats->span_use[iclass].map_calls =
-        (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls);
-  }
-  for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-    stats->size_use[iclass].alloc_current =
-        (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current);
-    stats->size_use[iclass].alloc_peak =
-        (size_t)heap->size_class_use[iclass].alloc_peak;
-    stats->size_use[iclass].alloc_total =
-        (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total);
-    stats->size_use[iclass].free_total =
-        (size_t)atomic_load32(&heap->size_class_use[iclass].free_total);
-    stats->size_use[iclass].spans_to_cache =
-        (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache);
-    stats->size_use[iclass].spans_from_cache =
-        (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache);
-    stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32(
-        &heap->size_class_use[iclass].spans_from_reserved);
-    stats->size_use[iclass].map_calls =
-        (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls);
-  }
-#endif
-}
-
-void rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats) {
-  memset(stats, 0, sizeof(rpmalloc_global_statistics_t));
-#if ENABLE_STATISTICS
-  stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size;
-  stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size;
-  stats->mapped_total =
-      (size_t)atomic_load32(&_mapped_total) * _memory_page_size;
-  stats->unmapped_total =
-      (size_t)atomic_load32(&_unmapped_total) * _memory_page_size;
-  stats->huge_alloc =
-      (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size;
-  stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size;
-#endif
-#if ENABLE_GLOBAL_CACHE
-  for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-    global_cache_t *cache = &_memory_span_cache[iclass];
-    while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-      _rpmalloc_spin();
-    uint32_t count = cache->count;
-#if ENABLE_UNLIMITED_CACHE
-    span_t *current_span = cache->overflow;
-    while (current_span) {
-      ++count;
-      current_span = current_span->next;
-    }
-#endif
-    atomic_store32_release(&cache->lock, 0);
-    stats->cached += count * (iclass + 1) * _memory_span_size;
-  }
-#endif
-}
-
-#if ENABLE_STATISTICS
-
-static void _memory_heap_dump_statistics(heap_t *heap, void *file) {
-  fprintf(file, "Heap %d stats:\n", heap->id);
-  fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize "
-                "BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB "
-                "FromCacheMiB FromReserveMiB MmapCalls\n");
-  for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-    if (!atomic_load32(&heap->size_class_use[iclass].alloc_total))
-      continue;
-    fprintf(
-        file,
-        "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu "
-        "%9u\n",
-        (uint32_t)iclass,
-        atomic_load32(&heap->size_class_use[iclass].alloc_current),
-        heap->size_class_use[iclass].alloc_peak,
-        atomic_load32(&heap->size_class_use[iclass].alloc_total),
-        atomic_load32(&heap->size_class_use[iclass].free_total),
-        _memory_size_class[iclass].block_size,
-        _memory_size_class[iclass].block_count,
-        atomic_load32(&heap->size_class_use[iclass].spans_current),
-        heap->size_class_use[iclass].spans_peak,
-        ((size_t)heap->size_class_use[iclass].alloc_peak *
-         (size_t)_memory_size_class[iclass].block_size) /
-            (size_t)(1024 * 1024),
-        ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) *
-         _memory_span_size) /
-            (size_t)(1024 * 1024),
-        ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) *
-         _memory_span_size) /
-            (size_t)(1024 * 1024),
-        ((size_t)atomic_load32(
-             &heap->size_class_use[iclass].spans_from_reserved) *
-         _memory_span_size) /
-            (size_t)(1024 * 1024),
-        atomic_load32(&heap->size_class_use[iclass].spans_map_calls));
-  }
-  fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB "
-                "FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB "
-                "FromGlobalMiB MmapCalls\n");
-  for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-    if (!atomic_load32(&heap->span_use[iclass].high) &&
-        !atomic_load32(&heap->span_use[iclass].spans_map_calls))
-      continue;
-    fprintf(
-        file,
-        "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n",
-        (uint32_t)(iclass + 1), atomic_load32(&heap->span_use[iclass].current),
-        atomic_load32(&heap->span_use[iclass].high),
-        atomic_load32(&heap->span_use[iclass].spans_deferred),
-        ((size_t)atomic_load32(&heap->span_use[iclass].high) *
-         (size_t)_memory_span_size * (iclass + 1)) /
-            (size_t)(1024 * 1024),
-#if ENABLE_THREAD_CACHE
-        (unsigned int)(!iclass ? 
heap->span_cache.count - : heap->span_large_cache[iclass - 1].count), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), -#else - 0, (size_t)0, (size_t)0, -#endif - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * - (size_t)_memory_span_size * (iclass + 1)) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * - (size_t)_memory_span_size * (iclass + 1)) / - (size_t)(1024 * 1024), - atomic_load32(&heap->span_use[iclass].spans_map_calls)); - } - fprintf(file, "Full spans: %zu\n", heap->full_span_count); - fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); - fprintf( - file, "%17zu %17zu\n", - (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), - (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); -} - -#endif - -void rpmalloc_dump_statistics(void *file) { -#if ENABLE_STATISTICS - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t *heap = _memory_heaps[list_idx]; - while (heap) { - int need_dump = 0; - for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); - ++iclass) { - if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { - rpmalloc_assert( - !atomic_load32(&heap->size_class_use[iclass].free_total), - "Heap statistics counter mismatch"); - rpmalloc_assert( - !atomic_load32(&heap->size_class_use[iclass].spans_map_calls), - "Heap statistics counter mismatch"); - continue; - } - need_dump = 1; - } - for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); - ++iclass) { - if (!atomic_load32(&heap->span_use[iclass].high) && - !atomic_load32(&heap->span_use[iclass].spans_map_calls)) - continue; - need_dump = 1; - } - if (need_dump) - _memory_heap_dump_statistics(heap, file); - heap = heap->next_heap; - } - } - fprintf(file, "Global stats:\n"); - size_t huge_current = - (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; - size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; - fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); - fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), - huge_peak / (size_t)(1024 * 1024)); - -#if ENABLE_GLOBAL_CACHE - fprintf(file, "GlobalCacheMiB\n"); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - global_cache_t *cache = _memory_span_cache + iclass; - size_t global_cache = (size_t)cache->count * iclass * _memory_span_size; - - size_t global_overflow_cache = 0; - span_t *span = cache->overflow; - while (span) { - global_overflow_cache += iclass * _memory_span_size; - span = span->next; - } - if (global_cache || global_overflow_cache || cache->insert_count || - cache->extract_count) - fprintf(file, - "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", - iclass + 1, global_cache / (size_t)(1024 * 1024), - global_overflow_cache / (size_t)(1024 * 1024), - cache->insert_count, cache->extract_count); - } -#endif - - size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; - size_t mapped_os = - (size_t)atomic_load32(&_mapped_pages_os) * 
_memory_page_size; - size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; - size_t mapped_total = - (size_t)atomic_load32(&_mapped_total) * _memory_page_size; - size_t unmapped_total = - (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; - fprintf( - file, - "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n"); - fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n", - mapped / (size_t)(1024 * 1024), mapped_os / (size_t)(1024 * 1024), - mapped_peak / (size_t)(1024 * 1024), - mapped_total / (size_t)(1024 * 1024), - unmapped_total / (size_t)(1024 * 1024)); - - fprintf(file, "\n"); -#if 0 - int64_t allocated = atomic_load64(&_allocation_counter); - int64_t deallocated = atomic_load64(&_deallocation_counter); - fprintf(file, "Allocation count: %lli\n", allocated); - fprintf(file, "Deallocation count: %lli\n", deallocated); - fprintf(file, "Current allocations: %lli\n", (allocated - deallocated)); - fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); - fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); -#endif -#endif - (void)sizeof(file); -} - -#if RPMALLOC_FIRST_CLASS_HEAPS - -extern inline rpmalloc_heap_t *rpmalloc_heap_acquire(void) { - // Must be a pristine heap from newly mapped memory pages, or else memory - // blocks could already be allocated from the heap which would (wrongly) be - // released when heap is cleared with rpmalloc_heap_free_all(). Also heaps - // guaranteed to be pristine from the dedicated orphan list can be used. - heap_t *heap = _rpmalloc_heap_allocate(1); - rpmalloc_assume(heap != NULL); - heap->owner_thread = 0; - _rpmalloc_stat_inc(&_memory_active_heaps); - return heap; -} - -extern inline void rpmalloc_heap_release(rpmalloc_heap_t *heap) { - if (heap) - _rpmalloc_heap_release(heap, 1, 1); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_allocate(heap, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, - size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_aligned_allocate(heap, alignment, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, size_t size) { - return rpmalloc_heap_aligned_calloc(heap, 0, num, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, - size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - void *block = _rpmalloc_aligned_allocate(heap, alignment, total); - if (block) - memset(block, 0, total); - return block; -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return ptr; - } -#endif - return _rpmalloc_reallocate(heap, ptr, size, 0, flags); -} - -extern inline RPMALLOC_ALLOCATOR void * 
-rpmalloc_heap_aligned_realloc(rpmalloc_heap_t *heap, void *ptr, - size_t alignment, size_t size, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > _memory_page_size)) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); -} - -extern inline void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr) { - (void)sizeof(heap); - _rpmalloc_deallocate(ptr); -} - -extern inline void rpmalloc_heap_free_all(rpmalloc_heap_t *heap) { - span_t *span; - span_t *next_span; - - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - span = heap->size_class[iclass].partial_span; - while (span) { - next_span = span->next; - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - heap->size_class[iclass].partial_span = 0; - span = heap->full_span[iclass]; - while (span) { - next_span = span->next; - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - - span = heap->size_class[iclass].cache; - if (span) - _rpmalloc_heap_cache_insert(heap, span); - heap->size_class[iclass].cache = 0; - } - memset(heap->size_class, 0, sizeof(heap->size_class)); - memset(heap->full_span, 0, sizeof(heap->full_span)); - - span = heap->large_huge_span; - while (span) { - next_span = span->next; - if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) - _rpmalloc_deallocate_huge(span); - else - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - heap->large_huge_span = 0; - heap->full_span_count = 0; - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - if (!span_cache->count) - continue; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - span_cache->count * (iclass + 1) * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, - span_cache->count); - _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, - span_cache->count); -#else - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); -#endif - span_cache->count = 0; - } -#endif - -#if ENABLE_STATISTICS - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); - atomic_store32(&heap->size_class_use[iclass].spans_current, 0); - } - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - atomic_store32(&heap->span_use[iclass].current, 0); - } -#endif -} - -extern inline void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap) { - heap_t *prev_heap = get_thread_heap_raw(); - if (prev_heap != heap) { - set_thread_heap(heap); - if (prev_heap) - rpmalloc_heap_release(prev_heap); - } -} - -extern inline rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr) { - // Grab the span, and then the heap from the span - span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask); - if (span) { - return span->heap; - } - return 0; -} - -#endif - -#if ENABLE_PRELOAD || ENABLE_OVERRIDE - -#include "malloc.c" - -#endif - -void rpmalloc_linker_reference(void) { (void)sizeof(_rpmalloc_initialized); } +//===---------------------- rpmalloc.c ------------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. +// +//===----------------------------------------------------------------------===// + +#include "rpmalloc.h" + +//////////// +/// +/// Build time configurable limits +/// +////// + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-macros" +#pragma clang diagnostic ignored "-Wunused-function" +#if __has_warning("-Wreserved-identifier") +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif +#if __has_warning("-Wstatic-in-inline") +#pragma clang diagnostic ignored "-Wstatic-in-inline" +#endif +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-macros" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#if !defined(__has_builtin) +#define __has_builtin(b) 0 +#endif + +#if defined(__GNUC__) || defined(__clang__) + +#if __has_builtin(__builtin_memcpy_inline) +#define _rpmalloc_memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s) +#else +#define _rpmalloc_memcpy_const(x, y, s) \ + do { \ + _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ + "len must be a constant integer"); \ + memcpy(x, y, s); \ + } while (0) +#endif + +#if __has_builtin(__builtin_memset_inline) +#define _rpmalloc_memset_const(x, y, s) __builtin_memset_inline(x, y, s) +#else +#define _rpmalloc_memset_const(x, y, s) \ + do { \ + _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ + "len must be a constant integer"); \ + memset(x, y, s); \ + } while (0) +#endif +#else +#define _rpmalloc_memcpy_const(x, y, s) memcpy(x, y, s) +#define _rpmalloc_memset_const(x, y, s) memset(x, y, s) +#endif + +#if __has_builtin(__builtin_assume) +#define rpmalloc_assume(cond) __builtin_assume(cond) +#elif defined(__GNUC__) +#define rpmalloc_assume(cond) \ + do { \ + if (!__builtin_expect(cond, 0)) \ + __builtin_unreachable(); \ + } while (0) +#elif defined(_MSC_VER) +#define rpmalloc_assume(cond) __assume(cond) +#else +#define rpmalloc_assume(cond) 0 +#endif + +#ifndef HEAP_ARRAY_SIZE +//! Size of heap hashmap +#define HEAP_ARRAY_SIZE 47 +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_STATISTICS +//! Enable statistics collection +#define ENABLE_STATISTICS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_OVERRIDE +//! Override standard library malloc/free and new/delete entry points +#define ENABLE_OVERRIDE 0 +#endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 0 +#endif +#ifndef DISABLE_UNMAP +//! Disable unmapping memory pages (also enables unlimited cache) +#define DISABLE_UNMAP 0 +#endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Enable unlimited global cache (no unmapping until finalization) +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive thread cache size based on use heuristics +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! 
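An editorial aside, hedged, not part of the original source: every ENABLE_*/DISABLE_*
+// switch above is a plain preprocessor define, so a build can set them on the
+// compiler command line instead of editing this file. A hypothetical example
+// (flag values are illustrative only):
+//
+//   cc -c rpmalloc.c -DENABLE_STATISTICS=1 -DENABLE_ASSERTS=1
+//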
+//! Default number of spans to map in a call to map more virtual memory
+//! (the default values yield 4MiB here)
+#define DEFAULT_SPAN_MAP_COUNT 64
+#endif
+#ifndef GLOBAL_CACHE_MULTIPLIER
+//! Multiplier for global cache
+#define GLOBAL_CACHE_MULTIPLIER 8
+#endif
+
+#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE
+#error Must use global cache if unmap is disabled
+#endif
+
+#if DISABLE_UNMAP
+#undef ENABLE_UNLIMITED_CACHE
+#define ENABLE_UNLIMITED_CACHE 1
+#endif
+
+#if !ENABLE_GLOBAL_CACHE
+#undef ENABLE_UNLIMITED_CACHE
+#define ENABLE_UNLIMITED_CACHE 0
+#endif
+
+#if !ENABLE_THREAD_CACHE
+#undef ENABLE_ADAPTIVE_THREAD_CACHE
+#define ENABLE_ADAPTIVE_THREAD_CACHE 0
+#endif
+
+#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64)
+#define PLATFORM_WINDOWS 1
+#define PLATFORM_POSIX 0
+#else
+#define PLATFORM_WINDOWS 0
+#define PLATFORM_POSIX 1
+#endif
+
+/// Platform and arch specifics
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(disable : 5105)
+#ifndef FORCEINLINE
+#define FORCEINLINE inline __forceinline
+#endif
+#define _Static_assert static_assert
+#else
+#ifndef FORCEINLINE
+#define FORCEINLINE inline __attribute__((__always_inline__))
+#endif
+#endif
+#if PLATFORM_WINDOWS
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#if ENABLE_VALIDATE_ARGS
+#include <intsafe.h>
+#endif
+#else
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#if defined(__linux__) || defined(__ANDROID__)
+#include <sys/prctl.h>
+#if !defined(PR_SET_VMA)
+#define PR_SET_VMA 0x53564d41
+#define PR_SET_VMA_ANON_NAME 0
+#endif
+#endif
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
+#include <mach/mach_vm.h>
+#include <mach/vm_statistics.h>
+#endif
+#include <pthread.h>
+#endif
+#if defined(__HAIKU__) || defined(__TINYC__)
+#include <pthread.h>
+#endif
+#endif
+
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+#include <fibersapi.h>
+static DWORD fls_key;
+#endif
+
+#if PLATFORM_POSIX
+#include <sys/mman.h>
+#include <sched.h>
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#define MAP_HUGETLB MAP_ALIGNED_SUPER
+#ifndef PROT_MAX
+#define PROT_MAX(f) 0
+#endif
+#else
+#define PROT_MAX(f) 0
+#endif
+#ifdef __sun
+extern int madvise(caddr_t, size_t, int);
+#endif
+#ifndef MAP_UNINITIALIZED
+#define MAP_UNINITIALIZED 0
+#endif
+#endif
+#include <errno.h>
+
+#if ENABLE_ASSERTS
+#undef NDEBUG
+#if defined(_MSC_VER) && !defined(_DEBUG)
+#define _DEBUG
+#endif
+#include <assert.h>
+#define RPMALLOC_TOSTRING_M(x) #x
+#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x)
+#define rpmalloc_assert(truth, message)                                       \
+  do {                                                                        \
+    if (!(truth)) {                                                           \
+      if (_memory_config.error_callback) {                                    \
+        _memory_config.error_callback(message " (" RPMALLOC_TOSTRING(         \
+            truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__));         \
+      } else {                                                                \
+        assert((truth) && message);                                           \
+      }                                                                       \
+    }                                                                         \
+  } while (0)
+#else
+#define rpmalloc_assert(truth, message)                                       \
+  do {                                                                        \
+  } while (0)
+#endif
+#if ENABLE_STATISTICS
+#include <stdio.h>
+#endif
+
+//////
+///
+/// Atomic access abstraction (since MSVC does not do C11 yet)
+///
+//////
+
+#if defined(_MSC_VER) && !defined(__clang__)
+
+typedef volatile long atomic32_t;
+typedef volatile long long atomic64_t;
+typedef volatile void *atomicptr_t;
+
+static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { return *src; }
+static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) {
+  *dst = val;
+}
+static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) {
+  return (int32_t)InterlockedIncrement(val);
+}
+static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) {
+  return (int32_t)InterlockedDecrement(val);
+}
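+
+// Editorial note, hedged: the Interlocked* wrappers below and the C11 branch
+// further down both return the *updated* value. InterlockedExchangeAdd and
+// atomic_fetch_add_explicit return the previous value, which is why
+// atomic_add32/atomic_add64 re-add the operand before returning.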
+static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) {
+  return (int32_t)InterlockedExchangeAdd(val, add) + add;
+}
+static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val,
+                                            int32_t ref) {
+  return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0;
+}
+static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) {
+  *dst = val;
+}
+static FORCEINLINE int64_t atomic_load64(atomic64_t *src) { return *src; }
+static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) {
+  return (int64_t)InterlockedExchangeAdd64(val, add) + add;
+}
+static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) {
+  return (void *)*src;
+}
+static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) {
+  *dst = val;
+}
+static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) {
+  *dst = val;
+}
+static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst,
+                                                     void *val) {
+  return (void *)InterlockedExchangePointer((void *volatile *)dst, val);
+}
+static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) {
+  return (InterlockedCompareExchangePointer((void *volatile *)dst, val, ref) ==
+          ref)
+             ? 1
+             : 0;
+}
+
+#define EXPECTED(x) (x)
+#define UNEXPECTED(x) (x)
+
+#else
+
+#include <stdatomic.h>
+
+typedef volatile _Atomic(int32_t) atomic32_t;
+typedef volatile _Atomic(int64_t) atomic64_t;
+typedef volatile _Atomic(void *) atomicptr_t;
+
+static FORCEINLINE int32_t atomic_load32(atomic32_t *src) {
+  return atomic_load_explicit(src, memory_order_relaxed);
+}
+static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) {
+  atomic_store_explicit(dst, val, memory_order_relaxed);
+}
+static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) {
+  return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1;
+}
+static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) {
+  return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1;
+}
+static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) {
+  return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add;
+}
+static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val,
+                                            int32_t ref) {
+  return atomic_compare_exchange_weak_explicit(
+      dst, &ref, val, memory_order_acquire, memory_order_relaxed);
+}
+static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) {
+  atomic_store_explicit(dst, val, memory_order_release);
+}
+static FORCEINLINE int64_t atomic_load64(atomic64_t *val) {
+  return atomic_load_explicit(val, memory_order_relaxed);
+}
+static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) {
+  return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add;
+}
+static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) {
+  return atomic_load_explicit(src, memory_order_relaxed);
+}
+static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) {
+  atomic_store_explicit(dst, val, memory_order_relaxed);
+}
+static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) {
+  atomic_store_explicit(dst, val, memory_order_release);
+}
+static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst,
+                                                     void *val) {
+  return atomic_exchange_explicit(dst, val, memory_order_acquire);
+}
+static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) {
+  return atomic_compare_exchange_weak_explicit(
+      dst, &ref, val, memory_order_relaxed, memory_order_relaxed);
+}
+
+#define EXPECTED(x) __builtin_expect((x), 1)
+#define UNEXPECTED(x) __builtin_expect((x),
0) + +#endif + +//////////// +/// +/// Statistics related functions (evaluate to nothing when statistics not +/// enabled) +/// +////// + +#if ENABLE_STATISTICS +#define _rpmalloc_stat_inc(counter) atomic_incr32(counter) +#define _rpmalloc_stat_dec(counter) atomic_decr32(counter) +#define _rpmalloc_stat_add(counter, value) \ + atomic_add32(counter, (int32_t)(value)) +#define _rpmalloc_stat_add64(counter, value) \ + atomic_add64(counter, (int64_t)(value)) +#define _rpmalloc_stat_add_peak(counter, value, peak) \ + do { \ + int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); \ + if (_cur_count > (peak)) \ + peak = _cur_count; \ + } while (0) +#define _rpmalloc_stat_sub(counter, value) \ + atomic_add32(counter, -(int32_t)(value)) +#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ + do { \ + int32_t alloc_current = \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ + } while (0) +#define _rpmalloc_stat_inc_free(heap, class_idx) \ + do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ + } while (0) +#else +#define _rpmalloc_stat_inc(counter) \ + do { \ + } while (0) +#define _rpmalloc_stat_dec(counter) \ + do { \ + } while (0) +#define _rpmalloc_stat_add(counter, value) \ + do { \ + } while (0) +#define _rpmalloc_stat_add64(counter, value) \ + do { \ + } while (0) +#define _rpmalloc_stat_add_peak(counter, value, peak) \ + do { \ + } while (0) +#define _rpmalloc_stat_sub(counter, value) \ + do { \ + } while (0) +#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ + do { \ + } while (0) +#define _rpmalloc_stat_inc_free(heap, class_idx) \ + do { \ + } while (0) +#endif + +/// +/// Preconfigured limits and sizes +/// + +//! Granularity of a small allocation block (must be power of two) +#define SMALL_GRANULARITY 16 +//! Small granularity shift count +#define SMALL_GRANULARITY_SHIFT 4 +//! Number of small block size classes +#define SMALL_CLASS_COUNT 65 +//! Maximum size of a small block +#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1)) +//! Granularity of a medium allocation block +#define MEDIUM_GRANULARITY 512 +//! Medium granularity shift count +#define MEDIUM_GRANULARITY_SHIFT 9 +//! Number of medium block size classes +#define MEDIUM_CLASS_COUNT 61 +//! Total number of small + medium size classes +#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) +//! Number of large block size classes +#define LARGE_CLASS_COUNT 63 +//! Maximum size of a medium block +#define MEDIUM_SIZE_LIMIT \ + (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) +//! Maximum size of a large block +#define LARGE_SIZE_LIMIT \ + ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) +//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power +//! of two) +#define SPAN_HEADER_SIZE 128 +//! Number of spans in thread cache +#define MAX_THREAD_SPAN_CACHE 400 +//! Number of spans to transfer between thread and global cache +#define THREAD_SPAN_CACHE_TRANSFER 64 +//! Number of spans in thread cache for large spans (must be greater than +//! LARGE_CLASS_COUNT / 2) +#define MAX_THREAD_SPAN_LARGE_CACHE 100 +//! 
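A short editorial note, hedged: the granularity constants above drive a
+// shift-based size-to-class mapping. A minimal sketch of the small-class
+// computation (illustrative only; the real lookup in this file also merges
+// classes and handles the zero-size case):
+static uint32_t sketch_small_class_idx(size_t size) {
+  // Round the size up to SMALL_GRANULARITY, then divide by it via the shift
+  return (uint32_t)((size + (SMALL_GRANULARITY - 1)) >>
+                    SMALL_GRANULARITY_SHIFT);
+}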
+//! Number of spans to transfer between thread and global cache for large
+//! spans
+#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6
+
+_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0,
+               "Small granularity must be power of two");
+_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0,
+               "Span header size must be power of two");
+
+#if ENABLE_VALIDATE_ARGS
+//! Maximum allocation size to avoid integer overflow
+#undef MAX_ALLOC_SIZE
+#define MAX_ALLOC_SIZE (((size_t) - 1) - _memory_span_size)
+#endif
+
+#define pointer_offset(ptr, ofs) (void *)((char *)(ptr) + (ptrdiff_t)(ofs))
+#define pointer_diff(first, second)                                           \
+  (ptrdiff_t)((const char *)(first) - (const char *)(second))
+
+#define INVALID_POINTER ((void *)((uintptr_t) - 1))
+
+#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT
+#define SIZE_CLASS_HUGE ((uint32_t) - 1)
+
+////////////
+///
+/// Data types
+///
+//////
+
+//! A memory heap, per thread
+typedef struct heap_t heap_t;
+//! Span of memory pages
+typedef struct span_t span_t;
+//! Span list
+typedef struct span_list_t span_list_t;
+//! Span active data
+typedef struct span_active_t span_active_t;
+//! Size class definition
+typedef struct size_class_t size_class_t;
+//! Global cache
+typedef struct global_cache_t global_cache_t;
+
+//! Flag indicating span is the first (master) span of a split superspan
+#define SPAN_FLAG_MASTER 1U
+//! Flag indicating span is a secondary (sub) span of a split superspan
+#define SPAN_FLAG_SUBSPAN 2U
+//! Flag indicating span has blocks with increased alignment
+#define SPAN_FLAG_ALIGNED_BLOCKS 4U
+//! Flag indicating an unmapped master span
+#define SPAN_FLAG_UNMAPPED_MASTER 8U
+
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+struct span_use_t {
+  //! Current number of spans used (actually used, not in cache)
+  atomic32_t current;
+  //! High water mark of spans used
+  atomic32_t high;
+#if ENABLE_STATISTICS
+  //! Number of spans in deferred list
+  atomic32_t spans_deferred;
+  //! Number of spans transitioned to global cache
+  atomic32_t spans_to_global;
+  //! Number of spans transitioned from global cache
+  atomic32_t spans_from_global;
+  //! Number of spans transitioned to thread cache
+  atomic32_t spans_to_cache;
+  //! Number of spans transitioned from thread cache
+  atomic32_t spans_from_cache;
+  //! Number of spans transitioned to reserved state
+  atomic32_t spans_to_reserved;
+  //! Number of spans transitioned from reserved state
+  atomic32_t spans_from_reserved;
+  //! Number of raw memory map calls
+  atomic32_t spans_map_calls;
+#endif
+};
+typedef struct span_use_t span_use_t;
+#endif
+
+#if ENABLE_STATISTICS
+struct size_class_use_t {
+  //! Current number of allocations
+  atomic32_t alloc_current;
+  //! Peak number of allocations
+  int32_t alloc_peak;
+  //! Total number of allocations
+  atomic32_t alloc_total;
+  //! Total number of frees
+  atomic32_t free_total;
+  //! Number of spans in use
+  atomic32_t spans_current;
+  //! Peak number of spans in use
+  int32_t spans_peak;
+  //! Number of spans transitioned to cache
+  atomic32_t spans_to_cache;
+  //! Number of spans transitioned from cache
+  atomic32_t spans_from_cache;
+  //! Number of spans transitioned from reserved state
+  atomic32_t spans_from_reserved;
+  //! Number of spans mapped
+  atomic32_t spans_map_calls;
+  int32_t unused;
+};
+typedef struct size_class_use_t size_class_use_t;
+#endif
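+
+// Editorial sketch, hedged: pointer_offset and pointer_diff above are the
+// byte-granular pointer helpers used throughout this file. A hypothetical
+// helper (not part of rpmalloc) showing their intended semantics:
+static int sketch_ptr_in_range(void *begin, void *ptr, size_t bytes) {
+  // True when ptr lies inside [begin, begin + bytes)
+  ptrdiff_t off = pointer_diff(ptr, begin);
+  return (off >= 0) && (off < (ptrdiff_t)bytes);
+}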
+
+// A span can either represent a single span of memory pages with size declared
+// by the span_map_count configuration variable, or a set of spans in a
+// continuous region, a super span. Any reference to the term "span" usually
+// refers to both a single span and a super span. A super span can further be
+// divided into multiple spans (or, likewise, super spans), where the first
+// (super)span is the master and subsequent (super)spans are subspans. The
+// master span keeps track of how many subspans are still alive and mapped in
+// virtual memory, and once all subspans and the master have been unmapped, the
+// entire superspan region is released and unmapped (on Windows, for example,
+// the entire superspan range has to be released in the same call to release
+// the virtual memory range, but individual subranges can be decommitted
+// individually to reduce physical memory use).
+struct span_t {
+  //! Free list
+  void *free_list;
+  //! Total block count of size class
+  uint32_t block_count;
+  //! Size class
+  uint32_t size_class;
+  //! Index of last block initialized in free list
+  uint32_t free_list_limit;
+  //! Number of used blocks remaining when in partial state
+  uint32_t used_count;
+  //! Deferred free list
+  atomicptr_t free_list_deferred;
+  //! Size of deferred free list, or list of spans when part of a cache list
+  uint32_t list_size;
+  //! Size of a block
+  uint32_t block_size;
+  //! Flags and counters
+  uint32_t flags;
+  //! Number of spans
+  uint32_t span_count;
+  //! Total span counter for master spans
+  uint32_t total_spans;
+  //! Offset from master span for subspans
+  uint32_t offset_from_master;
+  //! Remaining span counter, for master spans
+  atomic32_t remaining_spans;
+  //! Alignment offset
+  uint32_t align_offset;
+  //! Owning heap
+  heap_t *heap;
+  //! Next span
+  span_t *next;
+  //! Previous span
+  span_t *prev;
+};
+_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");
+
+struct span_cache_t {
+  size_t count;
+  span_t *span[MAX_THREAD_SPAN_CACHE];
+};
+typedef struct span_cache_t span_cache_t;
+
+struct span_large_cache_t {
+  size_t count;
+  span_t *span[MAX_THREAD_SPAN_LARGE_CACHE];
+};
+typedef struct span_large_cache_t span_large_cache_t;
+
+struct heap_size_class_t {
+  //! Free list of active span
+  void *free_list;
+  //! Double linked list of partially used spans with free blocks.
+  //  Previous span pointer in head points to tail span of list.
+  span_t *partial_span;
+  //! Early level cache of fully free spans
+  span_t *cache;
+};
+typedef struct heap_size_class_t heap_size_class_t;
+
+// Control structure for a heap, either a thread heap or a first class heap if
+// enabled
+struct heap_t {
+  //! Owning thread ID
+  uintptr_t owner_thread;
+  //! Free lists for each size class
+  heap_size_class_t size_class[SIZE_CLASS_COUNT];
+#if ENABLE_THREAD_CACHE
+  //! Arrays of fully freed spans, single span
+  span_cache_t span_cache;
+#endif
+  //! List of deferred free spans (single linked list)
+  atomicptr_t span_free_deferred;
+  //! Number of full spans
+  size_t full_span_count;
+  //! Mapped but unused spans
+  span_t *span_reserve;
+  //! Master span for mapped but unused spans
+  span_t *span_reserve_master;
+  //! Number of mapped but unused spans
+  uint32_t spans_reserved;
+  //! Child count
+  atomic32_t child_count;
+  //! Next heap in id list
+  heap_t *next_heap;
+  //! Next heap in orphan list
+  heap_t *next_orphan;
+  //! Heap ID
+  int32_t id;
+  //! Finalization state flag
+  int finalize;
+  //! Master heap owning the memory pages
+  heap_t *master_heap;
+#if ENABLE_THREAD_CACHE
+  //! Arrays of fully freed spans, large spans with > 1 span count
+  span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1];
+#endif
+#if RPMALLOC_FIRST_CLASS_HEAPS
+  //! Double linked list of fully utilized spans for each size class.
+  //  Previous span pointer in head points to tail span of list.
+  span_t *full_span[SIZE_CLASS_COUNT];
+  //! Double linked list of large and huge spans allocated by this heap
+  span_t *large_huge_span;
+#endif
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+  //! Current and high water mark of spans used per span count
+  span_use_t span_use[LARGE_CLASS_COUNT];
+#endif
+#if ENABLE_STATISTICS
+  //! Allocation stats per size class
+  size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1];
+  //! Number of bytes transitioned thread -> global
+  atomic64_t thread_to_global;
+  //! Number of bytes transitioned global -> thread
+  atomic64_t global_to_thread;
+#endif
+};
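+
+// Editorial sketch, hedged: the master/subspan accounting described above
+// boils down to an atomic countdown on the master span. A hypothetical helper
+// (not the allocator's code, which does this inline while unmapping):
+static int sketch_superspan_release_ready(span_t *master, uint32_t unmapped) {
+  // Nonzero once the last live (sub)span of the superspan is accounted for
+  return atomic_add32(&master->remaining_spans, -(int32_t)unmapped) <= 0;
+}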
+
+// Size class for defining a block size bucket
+struct size_class_t {
+  //! Size of blocks in this class
+  uint32_t block_size;
+  //! Number of blocks in each chunk
+  uint16_t block_count;
+  //! Class index this class is merged with
+  uint16_t class_idx;
+};
+_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch");
+
+struct global_cache_t {
+  //! Cache lock
+  atomic32_t lock;
+  //! Cache count
+  uint32_t count;
+#if ENABLE_STATISTICS
+  //! Insert count
+  size_t insert_count;
+  //! Extract count
+  size_t extract_count;
+#endif
+  //! Cached spans
+  span_t *span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE];
+  //! Unlimited cache overflow
+  span_t *overflow;
+};
+
+////////////
+///
+/// Global data
+///
+//////
+
+//! Default span size (64KiB)
+#define _memory_default_span_size (64 * 1024)
+#define _memory_default_span_size_shift 16
+#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1)))
+
+//! Initialized flag
+static int _rpmalloc_initialized;
+//! Main thread ID
+static uintptr_t _rpmalloc_main_thread_id;
+//! Configuration
+static rpmalloc_config_t _memory_config;
+//! Memory page size
+static size_t _memory_page_size;
+//! Shift to divide by page size
+static size_t _memory_page_size_shift;
+//! Granularity at which memory pages are mapped by OS
+static size_t _memory_map_granularity;
+#if RPMALLOC_CONFIGURABLE
+//! Size of a span of memory pages
+static size_t _memory_span_size;
+//! Shift to divide by span size
+static size_t _memory_span_size_shift;
+//! Mask to get to start of a memory span
+static uintptr_t _memory_span_mask;
+#else
+//! Hardwired span size
+#define _memory_span_size _memory_default_span_size
+#define _memory_span_size_shift _memory_default_span_size_shift
+#define _memory_span_mask _memory_default_span_mask
+#endif
+//! Number of spans to map in each map call
+static size_t _memory_span_map_count;
+//! Number of spans to keep reserved in each heap
+static size_t _memory_heap_reserve_count;
+//! Global size classes
+static size_class_t _memory_size_class[SIZE_CLASS_COUNT];
+//! Run-time size limit of medium blocks
+static size_t _memory_medium_size_limit;
+//! Heap ID counter
+static atomic32_t _memory_heap_id;
+//! Huge page support
+static int _memory_huge_pages;
+#if ENABLE_GLOBAL_CACHE
+//! Global span cache
+static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT];
+#endif
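+
+// Editorial sketch, hedged: _memory_span_mask is what lets deallocation find
+// its metadata in O(1); any pointer into a span maps back to the span_t
+// header by masking off the low bits. Hypothetical helper (the real code
+// performs this cast inline, as in rpmalloc_get_heap_for_ptr):
+static span_t *sketch_span_of(void *ptr) {
+  return (span_t *)((uintptr_t)ptr & _memory_span_mask);
+}
+
+//!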
Global reserved spans +static span_t *_memory_global_reserve; +//! Global reserved count +static size_t _memory_global_reserve_count; +//! Global reserved master +static span_t *_memory_global_reserve_master; +//! All heaps +static heap_t *_memory_heaps[HEAP_ARRAY_SIZE]; +//! Used to restrict access to mapping memory for huge pages +static atomic32_t _memory_global_lock; +//! Orphaned heaps +static heap_t *_memory_orphan_heaps; +#if RPMALLOC_FIRST_CLASS_HEAPS +//! Orphaned heaps (first class heaps) +static heap_t *_memory_first_class_orphan_heaps; +#endif +#if ENABLE_STATISTICS +//! Allocations counter +static atomic64_t _allocation_counter; +//! Deallocations counter +static atomic64_t _deallocation_counter; +//! Active heap count +static atomic32_t _memory_active_heaps; +//! Number of currently mapped memory pages +static atomic32_t _mapped_pages; +//! Peak number of concurrently mapped memory pages +static int32_t _mapped_pages_peak; +//! Number of mapped master spans +static atomic32_t _master_spans; +//! Number of unmapped dangling master spans +static atomic32_t _unmapped_master_spans; +//! Running counter of total number of mapped memory pages since start +static atomic32_t _mapped_total; +//! Running counter of total number of unmapped memory pages since start +static atomic32_t _unmapped_total; +//! Number of currently mapped memory pages in OS calls +static atomic32_t _mapped_pages_os; +//! Number of currently allocated pages in huge allocations +static atomic32_t _huge_pages_current; +//! Peak number of currently allocated pages in huge allocations +static int32_t _huge_pages_peak; +#endif + +//////////// +/// +/// Thread local heap and ID +/// +////// + +//! Current thread heap +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ + defined(__TINYC__) +static pthread_key_t _memory_thread_heap; +#else +#ifdef _MSC_VER +#define _Thread_local __declspec(thread) +#define TLS_MODEL +#else +#ifndef __HAIKU__ +#define TLS_MODEL __attribute__((tls_model("initial-exec"))) +#else +#define TLS_MODEL +#endif +#if !defined(__clang__) && defined(__GNUC__) +#define _Thread_local __thread +#endif +#endif +static _Thread_local heap_t *_memory_thread_heap TLS_MODEL; +#endif + +static inline heap_t *get_thread_heap_raw(void) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! Get the current thread heap +static inline heap_t *get_thread_heap(void) { + heap_t *heap = get_thread_heap_raw(); +#if ENABLE_PRELOAD + if (EXPECTED(heap != 0)) + return heap; + rpmalloc_initialize(); + return get_thread_heap_raw(); +#else + return heap; +#endif +} + +//! 
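An editorial usage note, hedged: get_thread_heap() above is the first step of
+// every public allocation entry point. A sketch of that pattern, disabled
+// because _rpmalloc_allocate is only defined further down in this file:
+#if 0
+void *sketch_alloc(size_t size) {
+  heap_t *heap = get_thread_heap(); // may lazily initialize under ENABLE_PRELOAD
+  return _rpmalloc_allocate(heap, size);
+}
+#endif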
+//! Fast thread ID
+static inline uintptr_t get_thread_id(void) {
+#if defined(_WIN32)
+  return (uintptr_t)((void *)NtCurrentTeb());
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__)
+  uintptr_t tid;
+#if defined(__i386__)
+  __asm__("movl %%gs:0, %0" : "=r"(tid) : :);
+#elif defined(__x86_64__)
+#if defined(__MACH__)
+  __asm__("movq %%gs:0, %0" : "=r"(tid) : :);
+#else
+  __asm__("movq %%fs:0, %0" : "=r"(tid) : :);
+#endif
+#elif defined(__arm__)
+  __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid));
+#elif defined(__aarch64__)
+#if defined(__MACH__)
+  // tpidr_el0 likely unused, always return 0 on iOS
+  __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid));
+#else
+  __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid));
+#endif
+#else
+#error This platform needs implementation of get_thread_id()
+#endif
+  return tid;
+#else
+#error This platform needs implementation of get_thread_id()
+#endif
+}
+
+//! Set the current thread heap
+static void set_thread_heap(heap_t *heap) {
+#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) ||         \
+    defined(__TINYC__)
+  pthread_setspecific(_memory_thread_heap, heap);
+#else
+  _memory_thread_heap = heap;
+#endif
+  if (heap)
+    heap->owner_thread = get_thread_id();
+}
+
+//! Set main thread ID
+extern void rpmalloc_set_main_thread(void);
+
+void rpmalloc_set_main_thread(void) {
+  _rpmalloc_main_thread_id = get_thread_id();
+}
+
+static void _rpmalloc_spin(void) {
+#if defined(_MSC_VER)
+#if defined(_M_ARM64)
+  __yield();
+#else
+  _mm_pause();
+#endif
+#elif defined(__x86_64__) || defined(__i386__)
+  __asm__ volatile("pause" ::: "memory");
+#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7)
+  __asm__ volatile("yield" ::: "memory");
+#elif defined(__powerpc__) || defined(__powerpc64__)
+  // Unclear whether this is ever compiled for these archs, but keep it as a
+  // precaution
+  __asm__ volatile("or 27,27,27");
+#elif defined(__sparc__)
+  __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0");
+#else
+  struct timespec ts = {0};
+  nanosleep(&ts, 0);
+#endif
+}
+
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+static void NTAPI _rpmalloc_thread_destructor(void *value) {
+#if ENABLE_OVERRIDE
+  // If this is called on the main thread, it means rpmalloc_finalize
+  // has not been called and shutdown is forced (through _exit) or unclean
+  if (get_thread_id() == _rpmalloc_main_thread_id)
+    return;
+#endif
+  if (value)
+    rpmalloc_thread_finalize(1);
+}
+#endif
+
+////////////
+///
+/// Low level memory map/unmap
+///
+//////
+
+static void _rpmalloc_set_name(void *address, size_t size) {
+#if defined(__linux__) || defined(__ANDROID__)
+  const char *name = _memory_huge_pages ? _memory_config.huge_page_name
+                                        : _memory_config.page_name;
+  if (address == MAP_FAILED || !name)
+    return;
+  // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails
+  // (e.g. invalid name) it is effectively a no-op.
+  (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size,
+              (uintptr_t)name);
+#else
+  (void)sizeof(size);
+  (void)sizeof(address);
+#endif
+}
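+
+// Editorial sketch, hedged: _rpmalloc_spin pairs with atomic_cas32_acquire
+// and atomic_store32_release to form the tiny spin lock used for the global
+// caches and the global reserve. Hypothetical lock variable and helper:
+static atomic32_t sketch_lock;
+static void sketch_with_lock(void (*critical_section)(void)) {
+  while (!atomic_cas32_acquire(&sketch_lock, 1, 0))
+    _rpmalloc_spin(); // yield the core politely while contended
+  critical_section();
+  atomic_store32_release(&sketch_lock, 0); // pairs with the acquiring CAS
+}
+
+//!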
Map more virtual memory
+//  size is number of bytes to map
+//  offset receives the offset in bytes from start of mapped region
+//  returns address to start of mapped region to use
+static void *_rpmalloc_mmap(size_t size, size_t *offset) {
+  rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size");
+  rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size");
+  void *address = _memory_config.memory_map(size, offset);
+  if (EXPECTED(address != 0)) {
+    _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift),
+                            _mapped_pages_peak);
+    _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift));
+  }
+  return address;
+}
+
+//! Unmap virtual memory
+//  address is the memory address to unmap, as returned from _memory_map
+//  size is the number of bytes to unmap, which might be less than the full
+//    region for a partial unmap
+//  offset is the offset in bytes to the actual mapped region, as set by
+//    _memory_map
+//  release is set to 0 for a partial unmap, or to the size of the entire
+//    range for a full unmap
+static void _rpmalloc_unmap(void *address, size_t size, size_t offset,
+                            size_t release) {
+  rpmalloc_assert(!release || (release >= size), "Invalid unmap size");
+  rpmalloc_assert(!release || (release >= _memory_page_size),
+                  "Invalid unmap size");
+  if (release) {
+    rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size");
+    _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift));
+    _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift));
+  }
+  _memory_config.memory_unmap(address, size, offset, release);
+}
+
+//! Default implementation to map new pages to virtual memory
+static void *_rpmalloc_mmap_os(size_t size, size_t *offset) {
+  // Either size is a heap (a single page) or a (multiple) span - we only need
+  // to align spans, and only if larger than map granularity
+  size_t padding = ((size >= _memory_span_size) &&
+                    (_memory_span_size > _memory_map_granularity))
+                       ? _memory_span_size
+                       : 0;
+  rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size");
+#if PLATFORM_WINDOWS
+  // Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not
+  // allocated unless/until the virtual addresses are actually accessed"
+  void *ptr = VirtualAlloc(0, size + padding,
+                           (_memory_huge_pages ? MEM_LARGE_PAGES : 0) |
+                               MEM_RESERVE | MEM_COMMIT,
+                           PAGE_READWRITE);
+  if (!ptr) {
+    if (_memory_config.map_fail_callback) {
+      if (_memory_config.map_fail_callback(size + padding))
+        return _rpmalloc_mmap_os(size, offset);
+    } else {
+      rpmalloc_assert(ptr, "Failed to map virtual memory block");
+    }
+    return 0;
+  }
+#else
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED;
+#if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
+  int fd = (int)VM_MAKE_TAG(240U);
+  if (_memory_huge_pages)
+    fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+  void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0);
+#elif defined(MAP_HUGETLB)
+  void *ptr = mmap(0, size + padding,
+                   PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE),
+                   (_memory_huge_pages ?
MAP_HUGETLB : 0) | flags, -1, 0);
+#if defined(MADV_HUGEPAGE)
+  // In some configurations, huge page allocations might fail, so we fall
+  // back to normal allocations and promote the region to a transparent huge
+  // page
+  if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) {
+    ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
+    if (ptr && ptr != MAP_FAILED) {
+      int prm = madvise(ptr, size + padding, MADV_HUGEPAGE);
+      (void)prm;
+      rpmalloc_assert((prm == 0), "Failed to promote the page to THP");
+    }
+  }
+#endif
+  _rpmalloc_set_name(ptr, size + padding);
+#elif defined(MAP_ALIGNED)
+  const size_t align =
+      (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1));
+  void *ptr =
+      mmap(0, size + padding, PROT_READ | PROT_WRITE,
+           (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0);
+#elif defined(MAP_ALIGN)
+  caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0);
+  void *ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE,
+                   (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0);
+#else
+  void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
+#endif
+  if ((ptr == MAP_FAILED) || !ptr) {
+    if (_memory_config.map_fail_callback) {
+      if (_memory_config.map_fail_callback(size + padding))
+        return _rpmalloc_mmap_os(size, offset);
+    } else if (errno != ENOMEM) {
+      rpmalloc_assert((ptr != MAP_FAILED) && ptr,
+                      "Failed to map virtual memory block");
+    }
+    return 0;
+  }
+#endif
+  _rpmalloc_stat_add(&_mapped_pages_os,
+                     (int32_t)((size + padding) >> _memory_page_size_shift));
+  if (padding) {
+    size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask);
+    rpmalloc_assert(final_padding <= _memory_span_size,
+                    "Internal failure in padding");
+    rpmalloc_assert(final_padding <= padding, "Internal failure in padding");
+    rpmalloc_assert(!(final_padding % 8), "Internal failure in padding");
+    ptr = pointer_offset(ptr, final_padding);
+    *offset = final_padding >> 3;
+  }
+  rpmalloc_assert((size < _memory_span_size) ||
+                      !((uintptr_t)ptr & ~_memory_span_mask),
+                  "Internal failure in padding");
+  return ptr;
+}
+
+//! Default implementation to unmap pages from virtual memory
+static void _rpmalloc_unmap_os(void *address, size_t size, size_t offset,
+                               size_t release) {
+  rpmalloc_assert(release || (offset == 0), "Invalid unmap size");
+  rpmalloc_assert(!release || (release >= _memory_page_size),
+                  "Invalid unmap size");
+  rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size");
+  if (release && offset) {
+    offset <<= 3;
+    address = pointer_offset(address, -(int32_t)offset);
+    if ((release >= _memory_span_size) &&
+        (_memory_span_size > _memory_map_granularity)) {
+      // Padding is always one span size
+      release += _memory_span_size;
+    }
+  }
+#if !DISABLE_UNMAP
+#if PLATFORM_WINDOWS
+  if (!VirtualFree(address, release ? 0 : size,
+                   release ?
MEM_RELEASE : MEM_DECOMMIT)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } + } else { +#if defined(MADV_FREE_REUSABLE) + int ret; + while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && + (errno == EAGAIN)) + errno = 0; + if ((ret == -1) && (errno != 0)) { +#elif defined(MADV_DONTNEED) + if (madvise(address, size, MADV_DONTNEED)) { +#elif defined(MADV_PAGEOUT) + if (madvise(address, size, MADV_PAGEOUT)) { +#elif defined(MADV_FREE) + if (madvise(address, size, MADV_FREE)) { +#else + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { +#endif + rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); + } + } +#endif +#endif + if (release) + _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); +} + +static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, + span_t *subspan, + size_t span_count); + +//! Use global reserved spans to fulfill a memory map request (reserve size must +//! be checked by caller) +static span_t *_rpmalloc_global_get_reserved_spans(size_t span_count) { + span_t *span = _memory_global_reserve; + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, + span, span_count); + _memory_global_reserve_count -= span_count; + if (_memory_global_reserve_count) + _memory_global_reserve = + (span_t *)pointer_offset(span, span_count << _memory_span_size_shift); + else + _memory_global_reserve = 0; + return span; +} + +//! Store the given spans as global reserve (must only be called from within new +//! heap allocation, not thread safe) +static void _rpmalloc_global_set_reserved_spans(span_t *master, span_t *reserve, + size_t reserve_span_count) { + _memory_global_reserve_master = master; + _memory_global_reserve_count = reserve_span_count; + _memory_global_reserve = reserve; +} + +//////////// +/// +/// Span linked list management +/// +////// + +//! Add a span to double linked list at the head +static void _rpmalloc_span_double_link_list_add(span_t **head, span_t *span) { + if (*head) + (*head)->prev = span; + span->next = *head; + *head = span; +} + +//! Pop head span from double linked list +static void _rpmalloc_span_double_link_list_pop_head(span_t **head, + span_t *span) { + rpmalloc_assert(*head == span, "Linked list corrupted"); + span = *head; + *head = span->next; +} + +//! Remove a span from double linked list +static void _rpmalloc_span_double_link_list_remove(span_t **head, + span_t *span) { + rpmalloc_assert(*head, "Linked list corrupted"); + if (*head == span) { + *head = span->next; + } else { + span_t *next_span = span->next; + span_t *prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) + next_span->prev = prev_span; + } +} + +//////////// +/// +/// Span control +/// +////// + +static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span); + +static void _rpmalloc_heap_finalize(heap_t *heap); + +static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, + span_t *reserve, + size_t reserve_span_count); + +//! Declare the span to be a subspan and store distance from master span and +//! 
span count +static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, + span_t *subspan, + size_t span_count) { + rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), + "Span master pointer and/or flag mismatch"); + if (subspan != master) { + subspan->flags = SPAN_FLAG_SUBSPAN; + subspan->offset_from_master = + (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> + _memory_span_size_shift); + subspan->align_offset = 0; + } + subspan->span_count = (uint32_t)span_count; +} + +//! Use reserved spans to fulfill a memory map request (reserve size must be +//! checked by caller) +static span_t *_rpmalloc_span_map_from_reserve(heap_t *heap, + size_t span_count) { + // Update the heap span reserve + span_t *span = heap->span_reserve; + heap->span_reserve = + (span_t *)pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= (uint32_t)span_count; + + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, + span_count); + if (span_count <= LARGE_CLASS_COUNT) + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); + + return span; +} + +//! Get the aligned number of spans to map in based on wanted count, configured +//! mapping granularity and the page size +static size_t _rpmalloc_span_align_count(size_t span_count) { + size_t request_count = (span_count > _memory_span_map_count) + ? span_count + : _memory_span_map_count; + if ((_memory_page_size > _memory_span_size) && + ((request_count * _memory_span_size) % _memory_page_size)) + request_count += + _memory_span_map_count - (request_count % _memory_span_map_count); + return request_count; +} + +//! Setup a newly mapped span +static void _rpmalloc_span_initialize(span_t *span, size_t total_span_count, + size_t span_count, size_t align_offset) { + span->total_spans = (uint32_t)total_span_count; + span->span_count = (uint32_t)span_count; + span->align_offset = (uint32_t)align_offset; + span->flags = SPAN_FLAG_MASTER; + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); +} + +static void _rpmalloc_span_unmap(span_t *span); + +//! Map an aligned set of spans, taking configured mapping granularity and the +//! page size into account +static span_t *_rpmalloc_span_map_aligned_count(heap_t *heap, + size_t span_count) { + // If we already have some, but not enough, reserved spans, release those to + // heap cache and map a new full set of spans. 
Otherwise we would waste memory
+  // if page size > span size (huge pages)
+  size_t aligned_span_count = _rpmalloc_span_align_count(span_count);
+  size_t align_offset = 0;
+  span_t *span = (span_t *)_rpmalloc_mmap(
+      aligned_span_count * _memory_span_size, &align_offset);
+  if (!span)
+    return 0;
+  _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset);
+  _rpmalloc_stat_inc(&_master_spans);
+  if (span_count <= LARGE_CLASS_COUNT)
+    _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls);
+  if (aligned_span_count > span_count) {
+    span_t *reserved_spans =
+        (span_t *)pointer_offset(span, span_count * _memory_span_size);
+    size_t reserved_count = aligned_span_count - span_count;
+    if (heap->spans_reserved) {
+      _rpmalloc_span_mark_as_subspan_unless_master(
+          heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
+      _rpmalloc_heap_cache_insert(heap, heap->span_reserve);
+    }
+    if (reserved_count > _memory_heap_reserve_count) {
+      // If huge pages or eager span map count, the global reserve spin lock is
+      // held by caller, _rpmalloc_span_map
+      rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1,
+                      "Global spin lock not held as expected");
+      size_t remain_count = reserved_count - _memory_heap_reserve_count;
+      reserved_count = _memory_heap_reserve_count;
+      span_t *remain_span = (span_t *)pointer_offset(
+          reserved_spans, reserved_count * _memory_span_size);
+      if (_memory_global_reserve) {
+        _rpmalloc_span_mark_as_subspan_unless_master(
+            _memory_global_reserve_master, _memory_global_reserve,
+            _memory_global_reserve_count);
+        _rpmalloc_span_unmap(_memory_global_reserve);
+      }
+      _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count);
+    }
+    _rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans,
+                                      reserved_count);
+  }
+  return span;
+}
+
+//! Map in memory pages for the given number of spans (or use previously
+//! reserved pages)
+static span_t *_rpmalloc_span_map(heap_t *heap, size_t span_count) {
+  if (span_count <= heap->spans_reserved)
+    return _rpmalloc_span_map_from_reserve(heap, span_count);
+  span_t *span = 0;
+  int use_global_reserve =
+      (_memory_page_size > _memory_span_size) ||
+      (_memory_span_map_count > _memory_heap_reserve_count);
+  if (use_global_reserve) {
+    // If huge pages, make sure only one thread maps more memory to avoid bloat
+    while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
+      _rpmalloc_spin();
+    if (_memory_global_reserve_count >= span_count) {
+      size_t reserve_count =
+          (!heap->spans_reserved ? _memory_heap_reserve_count : span_count);
+      if (_memory_global_reserve_count < reserve_count)
+        reserve_count = _memory_global_reserve_count;
+      span = _rpmalloc_global_get_reserved_spans(reserve_count);
+      if (span) {
+        if (reserve_count > span_count) {
+          span_t *reserved_span = (span_t *)pointer_offset(
+              span, span_count << _memory_span_size_shift);
+          _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master,
+                                            reserved_span,
+                                            reserve_count - span_count);
+        }
+        // Already marked as subspan in _rpmalloc_global_get_reserved_spans
+        span->span_count = (uint32_t)span_count;
+      }
+    }
+  }
+  if (!span)
+    span = _rpmalloc_span_map_aligned_count(heap, span_count);
+  if (use_global_reserve)
+    atomic_store32_release(&_memory_global_lock, 0);
+  return span;
+}
+
+//! Unmap memory pages for the given number of spans (or mark as unused if no
+//! 
partial unmappings) +static void _rpmalloc_span_unmap(span_t *span) { + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || + (span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || + !(span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + + int is_master = !!(span->flags & SPAN_FLAG_MASTER); + span_t *master = + is_master ? span + : ((span_t *)pointer_offset( + span, -(intptr_t)((uintptr_t)span->offset_from_master * + _memory_span_size))); + rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); + + size_t span_count = span->span_count; + if (!is_master) { + // Directly unmap subspans (unless huge pages, in which case we defer and + // unmap entire page range with master) + rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted"); + if (_memory_span_size >= _memory_page_size) + _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); + } else { + // Special double flag to denote an unmapped master + // It must be kept in memory since span header must be used + span->flags |= + SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER; + _rpmalloc_stat_add(&_unmapped_master_spans, 1); + } + + if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { + // Everything unmapped, unmap the master span with release flag to unmap the + // entire range of the super span + rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) && + !!(master->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + size_t unmap_count = master->span_count; + if (_memory_span_size < _memory_page_size) + unmap_count = master->total_spans; + _rpmalloc_stat_sub(&_master_spans, 1); + _rpmalloc_stat_sub(&_unmapped_master_spans, 1); + _rpmalloc_unmap(master, unmap_count * _memory_span_size, + master->align_offset, + (size_t)master->total_spans * _memory_span_size); + } +} + +//! Move the span (used for small or medium allocations) to the heap thread +//! cache +static void _rpmalloc_span_release_to_cache(heap_t *heap, span_t *span) { + rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted"); + rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, + "Invalid span size class"); + rpmalloc_assert(span->span_count == 1, "Invalid span count"); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + atomic_decr32(&heap->span_use[0].current); +#endif + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (!heap->finalize) { + _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); + _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); + if (heap->size_class[span->size_class].cache) + _rpmalloc_heap_cache_insert(heap, + heap->size_class[span->size_class].cache); + heap->size_class[span->size_class].cache = span; + } else { + _rpmalloc_span_unmap(span); + } +} + +//! Initialize a (partial) free list up to next system memory page, while +//! 
reserving the first block as allocated, returning number of blocks in list +static uint32_t free_list_partial_init(void **list, void **first_block, + void *page_start, void *block_start, + uint32_t block_count, + uint32_t block_size) { + rpmalloc_assert(block_count, "Internal failure"); + *first_block = block_start; + if (block_count > 1) { + void *free_block = pointer_offset(block_start, block_size); + void *block_end = + pointer_offset(block_start, (size_t)block_size * block_count); + // If block size is less than half a memory page, bound init to next memory + // page boundary + if (block_size < (_memory_page_size >> 1)) { + void *page_end = pointer_offset(page_start, _memory_page_size); + if (page_end < block_end) + block_end = page_end; + } + *list = free_block; + block_count = 2; + void *next_block = pointer_offset(free_block, block_size); + while (next_block < block_end) { + *((void **)free_block) = next_block; + free_block = next_block; + ++block_count; + next_block = pointer_offset(next_block, block_size); + } + *((void **)free_block) = 0; + } else { + *list = 0; + } + return block_count; +} + +//! Initialize an unused span (from cache or mapped) to be new active span, +//! putting the initial free list in heap class free list +static void *_rpmalloc_span_initialize_new(heap_t *heap, + heap_size_class_t *heap_size_class, + span_t *span, uint32_t class_idx) { + rpmalloc_assert(span->span_count == 1, "Internal failure"); + size_class_t *size_class = _memory_size_class + class_idx; + span->size_class = class_idx; + span->heap = heap; + span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; + span->block_size = size_class->block_size; + span->block_count = size_class->block_count; + span->free_list = 0; + span->list_size = 0; + atomic_store_ptr_release(&span->free_list_deferred, 0); + + // Setup free list. 
Only initialize one system page worth of free blocks in
+  // list
+  void *block;
+  span->free_list_limit =
+      free_list_partial_init(&heap_size_class->free_list, &block, span,
+                             pointer_offset(span, SPAN_HEADER_SIZE),
+                             size_class->block_count, size_class->block_size);
+  // Link span as partial if blocks remain to be initialized as free list,
+  // or full if fully initialized
+  if (span->free_list_limit < span->block_count) {
+    _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span);
+    span->used_count = span->free_list_limit;
+  } else {
+#if RPMALLOC_FIRST_CLASS_HEAPS
+    _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
+#endif
+    ++heap->full_span_count;
+    span->used_count = span->block_count;
+  }
+  return block;
+}
+
+static void _rpmalloc_span_extract_free_list_deferred(span_t *span) {
+  // We need acquire semantics on the CAS operation since we are interested in
+  // the list size. Refer to _rpmalloc_deallocate_defer_small_or_medium for
+  // further comments on this dependency
+  do {
+    span->free_list =
+        atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+  } while (span->free_list == INVALID_POINTER);
+  span->used_count -= span->list_size;
+  span->list_size = 0;
+  atomic_store_ptr_release(&span->free_list_deferred, 0);
+}
+
+static int _rpmalloc_span_is_fully_utilized(span_t *span) {
+  rpmalloc_assert(span->free_list_limit <= span->block_count,
+                  "Span free list corrupted");
+  return !span->free_list && (span->free_list_limit >= span->block_count);
+}
+
+static int _rpmalloc_span_finalize(heap_t *heap, size_t iclass, span_t *span,
+                                   span_t **list_head) {
+  void *free_list = heap->size_class[iclass].free_list;
+  span_t *class_span = (span_t *)((uintptr_t)free_list & _memory_span_mask);
+  if (span == class_span) {
+    // Adopt the heap class free list back into the span free list
+    void *block = span->free_list;
+    void *last_block = 0;
+    while (block) {
+      last_block = block;
+      block = *((void **)block);
+    }
+    uint32_t free_count = 0;
+    block = free_list;
+    while (block) {
+      ++free_count;
+      block = *((void **)block);
+    }
+    if (last_block) {
+      *((void **)last_block) = free_list;
+    } else {
+      span->free_list = free_list;
+    }
+    heap->size_class[iclass].free_list = 0;
+    span->used_count -= free_count;
+  }
+  // If this assert triggers you have memory leaks
+  rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected");
+  if (span->list_size == span->used_count) {
+    _rpmalloc_stat_dec(&heap->span_use[0].current);
+    _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current);
+    // This function is only used for spans in double linked lists
+    if (list_head)
+      _rpmalloc_span_double_link_list_remove(list_head, span);
+    _rpmalloc_span_unmap(span);
+    return 1;
+  }
+  return 0;
+}
+
+////////////
+///
+/// Global cache
+///
+//////
+
+#if ENABLE_GLOBAL_CACHE
+
+//! Finalize a global cache
+static void _rpmalloc_global_cache_finalize(global_cache_t *cache) {
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+  for (size_t ispan = 0; ispan < cache->count; ++ispan)
+    _rpmalloc_span_unmap(cache->span[ispan]);
+  cache->count = 0;
+
+  while (cache->overflow) {
+    span_t *span = cache->overflow;
+    cache->overflow = span->next;
+    _rpmalloc_span_unmap(span);
+  }
+
+  atomic_store32_release(&cache->lock, 0);
+}
+
+static void _rpmalloc_global_cache_insert_spans(span_t **span,
+                                                size_t span_count,
+                                                size_t count) {
+  const size_t cache_limit =
+      (span_count == 1) ? 
GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE
+                        : GLOBAL_CACHE_MULTIPLIER *
+                              (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
+
+  global_cache_t *cache = &_memory_span_cache[span_count - 1];
+
+  size_t insert_count = count;
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+#if ENABLE_STATISTICS
+  cache->insert_count += count;
+#endif
+  if ((cache->count + insert_count) > cache_limit)
+    insert_count = cache_limit - cache->count;
+
+  memcpy(cache->span + cache->count, span, sizeof(span_t *) * insert_count);
+  cache->count += (uint32_t)insert_count;
+
+#if ENABLE_UNLIMITED_CACHE
+  while (insert_count < count) {
+#else
+  // Enable unlimited cache if huge pages, or we will leak since it is unlikely
+  // that an entire huge page will be unmapped, and we're unable to partially
+  // decommit a huge page
+  while ((_memory_page_size > _memory_span_size) && (insert_count < count)) {
+#endif
+    span_t *current_span = span[insert_count++];
+    current_span->next = cache->overflow;
+    cache->overflow = current_span;
+  }
+  atomic_store32_release(&cache->lock, 0);
+
+  span_t *keep = 0;
+  for (size_t ispan = insert_count; ispan < count; ++ispan) {
+    span_t *current_span = span[ispan];
+    // Keep master spans that have remaining subspans to avoid dangling them
+    if ((current_span->flags & SPAN_FLAG_MASTER) &&
+        (atomic_load32(&current_span->remaining_spans) >
+         (int32_t)current_span->span_count)) {
+      current_span->next = keep;
+      keep = current_span;
+    } else {
+      _rpmalloc_span_unmap(current_span);
+    }
+  }
+
+  if (keep) {
+    while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+      _rpmalloc_spin();
+
+    size_t islot = 0;
+    while (keep) {
+      for (; islot < cache->count; ++islot) {
+        span_t *current_span = cache->span[islot];
+        if (!(current_span->flags & SPAN_FLAG_MASTER) ||
+            ((current_span->flags & SPAN_FLAG_MASTER) &&
+             (atomic_load32(&current_span->remaining_spans) <=
+              (int32_t)current_span->span_count))) {
+          _rpmalloc_span_unmap(current_span);
+          cache->span[islot] = keep;
+          break;
+        }
+      }
+      if (islot == cache->count)
+        break;
+      keep = keep->next;
+    }
+
+    if (keep) {
+      span_t *tail = keep;
+      while (tail->next)
+        tail = tail->next;
+      tail->next = cache->overflow;
+      cache->overflow = keep;
+    }
+
+    atomic_store32_release(&cache->lock, 0);
+  }
+}
+
+static size_t _rpmalloc_global_cache_extract_spans(span_t **span,
+                                                   size_t span_count,
+                                                   size_t count) {
+  global_cache_t *cache = &_memory_span_cache[span_count - 1];
+
+  size_t extract_count = 0;
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+#if ENABLE_STATISTICS
+  cache->extract_count += count;
+#endif
+  size_t want = count - extract_count;
+  if (want > cache->count)
+    want = cache->count;
+
+  memcpy(span + extract_count, cache->span + (cache->count - want),
+         sizeof(span_t *) * want);
+  cache->count -= (uint32_t)want;
+  extract_count += want;
+
+  while ((extract_count < count) && cache->overflow) {
+    span_t *current_span = cache->overflow;
+    span[extract_count++] = current_span;
+    cache->overflow = current_span->next;
+  }
+
+#if ENABLE_ASSERTS
+  for (size_t ispan = 0; ispan < extract_count; ++ispan) {
+    rpmalloc_assert(span[ispan]->span_count == span_count,
+                    "Global cache span count mismatch");
+  }
+#endif
+
+  atomic_store32_release(&cache->lock, 0);
+
+  return extract_count;
+}
+
+#endif
+
+////////////
+///
+/// Heap control
+///
+//////
+
+static void _rpmalloc_deallocate_huge(span_t *);
+
+//! 
Store the given spans as reserve in the given heap +static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, + span_t *reserve, + size_t reserve_span_count) { + heap->span_reserve_master = master; + heap->span_reserve = reserve; + heap->spans_reserved = (uint32_t)reserve_span_count; +} + +//! Adopt the deferred span cache list, optionally extracting the first single +//! span for immediate re-use +static void _rpmalloc_heap_cache_adopt_deferred(heap_t *heap, + span_t **single_span) { + span_t *span = (span_t *)((void *)atomic_exchange_ptr_acquire( + &heap->span_free_deferred, 0)); + while (span) { + span_t *next_span = (span_t *)span->free_list; + rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; + _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], + span); +#endif + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } else { + if (span->size_class == SIZE_CLASS_HUGE) { + _rpmalloc_deallocate_huge(span); + } else { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, + "Span size class invalid"); + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); +#endif + uint32_t idx = span->span_count - 1; + _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); + _rpmalloc_stat_dec(&heap->span_use[idx].current); + if (!idx && single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } + } + span = next_span; + } +} + +static void _rpmalloc_heap_unmap(heap_t *heap) { + if (!heap->master_heap) { + if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { + span_t *span = (span_t *)((uintptr_t)heap & _memory_span_mask); + _rpmalloc_span_unmap(span); + } + } else { + if (atomic_decr32(&heap->master_heap->child_count) == 0) { + _rpmalloc_heap_unmap(heap->master_heap); + } + } +} + +static void _rpmalloc_heap_global_finalize(heap_t *heap) { + if (heap->finalize++ > 1) { + --heap->finalize; + return; + } + + _rpmalloc_heap_finalize(heap); + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + + if (heap->full_span_count) { + --heap->finalize; + return; + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].free_list || + heap->size_class[iclass].partial_span) { + --heap->finalize; + return; + } + } + // Heap is now completely free, unmap and remove from heap list + size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; + heap_t *list_heap = _memory_heaps[list_idx]; + if (list_heap == heap) { + _memory_heaps[list_idx] = heap->next_heap; + } else { + while (list_heap->next_heap != heap) + list_heap = list_heap->next_heap; + 
list_heap->next_heap = heap->next_heap; + } + + _rpmalloc_heap_unmap(heap); +} + +//! Insert a single span into thread heap cache, releasing to global cache if +//! overflow +static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span) { + if (UNEXPECTED(heap->finalize != 0)) { + _rpmalloc_span_unmap(span); + _rpmalloc_heap_global_finalize(heap); + return; + } +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); + if (span_count == 1) { + span_cache_t *span_cache = &heap->span_cache; + span_cache->span[span_cache->count++] = span; + if (span_cache->count == MAX_THREAD_SPAN_CACHE) { + const size_t remain_count = + MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, + THREAD_SPAN_CACHE_TRANSFER); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, + span_count, + THREAD_SPAN_CACHE_TRANSFER); +#else + for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } else { + size_t cache_idx = span_count - 2; + span_large_cache_t *span_cache = heap->span_large_cache + cache_idx; + span_cache->span[span_cache->count++] = span; + const size_t cache_limit = + (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + if (span_cache->count == cache_limit) { + const size_t transfer_limit = 2 + (cache_limit >> 2); + const size_t transfer_count = + (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit + ? THREAD_SPAN_LARGE_CACHE_TRANSFER + : transfer_limit); + const size_t remain_count = cache_limit - transfer_count; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + transfer_count * span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, + transfer_count); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, + span_count, transfer_count); +#else + for (size_t ispan = 0; ispan < transfer_count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } +#else + (void)sizeof(heap); + _rpmalloc_span_unmap(span); +#endif +} + +//! Extract the given number of spans from the different cache levels +static span_t *_rpmalloc_heap_thread_cache_extract(heap_t *heap, + size_t span_count) { + span_t *span = 0; +#if ENABLE_THREAD_CACHE + span_cache_t *span_cache; + if (span_count == 1) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); + if (span_cache->count) { + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); + return span_cache->span[--span_cache->count]; + } +#endif + return span; +} + +static span_t *_rpmalloc_heap_thread_cache_deferred_extract(heap_t *heap, + size_t span_count) { + span_t *span = 0; + if (span_count == 1) { + _rpmalloc_heap_cache_adopt_deferred(heap, &span); + } else { + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); + } + return span; +} + +static span_t *_rpmalloc_heap_reserved_extract(heap_t *heap, + size_t span_count) { + if (heap->spans_reserved >= span_count) + return _rpmalloc_span_map(heap, span_count); + return 0; +} + +//! 
Extract a span from the global cache +static span_t *_rpmalloc_heap_global_cache_extract(heap_t *heap, + size_t span_count) { +#if ENABLE_GLOBAL_CACHE +#if ENABLE_THREAD_CACHE + span_cache_t *span_cache; + size_t wanted_count; + if (span_count == 1) { + span_cache = &heap->span_cache; + wanted_count = THREAD_SPAN_CACHE_TRANSFER; + } else { + span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); + wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; + } + span_cache->count = _rpmalloc_global_cache_extract_spans( + span_cache->span, span_count, wanted_count); + if (span_cache->count) { + _rpmalloc_stat_add64(&heap->global_to_thread, + span_count * span_cache->count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, + span_cache->count); + return span_cache->span[--span_cache->count]; + } +#else + span_t *span = 0; + size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); + if (count) { + _rpmalloc_stat_add64(&heap->global_to_thread, + span_count * count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, + count); + return span; + } +#endif +#endif + (void)sizeof(heap); + (void)sizeof(span_count); + return 0; +} + +static void _rpmalloc_inc_span_statistics(heap_t *heap, size_t span_count, + uint32_t class_idx) { + (void)sizeof(heap); + (void)sizeof(span_count); + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = + (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) + atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); + _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, + heap->size_class_use[class_idx].spans_peak); +#endif +} + +//! Get a span from one of the cache levels (thread cache, reserved, global +//! cache) or fallback to mapping more memory +static span_t * +_rpmalloc_heap_extract_new_span(heap_t *heap, + heap_size_class_t *heap_size_class, + size_t span_count, uint32_t class_idx) { + span_t *span; +#if ENABLE_THREAD_CACHE + if (heap_size_class && heap_size_class->cache) { + span = heap_size_class->cache; + heap_size_class->cache = + (heap->span_cache.count + ? heap->span_cache.span[--heap->span_cache.count] + : 0); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } +#endif + (void)sizeof(class_idx); + // Allow 50% overhead to increase cache hits + size_t base_span_count = span_count; + size_t limit_span_count = + (span_count > 2) ? 
(span_count + (span_count >> 1)) : span_count;
+  if (limit_span_count > LARGE_CLASS_COUNT)
+    limit_span_count = LARGE_CLASS_COUNT;
+  do {
+    span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_global_cache_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_reserved_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    ++span_count;
+  } while (span_count <= limit_span_count);
+  // Final fallback, map in more virtual memory
+  span = _rpmalloc_span_map(heap, base_span_count);
+  _rpmalloc_inc_span_statistics(heap, base_span_count, class_idx);
+  _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls);
+  return span;
+}
+
+static void _rpmalloc_heap_initialize(heap_t *heap) {
+  _rpmalloc_memset_const(heap, 0, sizeof(heap_t));
+  // Get a new heap ID
+  heap->id = 1 + atomic_incr32(&_memory_heap_id);
+
+  // Link in heap in heap ID map
+  size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE;
+  heap->next_heap = _memory_heaps[list_idx];
+  _memory_heaps[list_idx] = heap;
+}
+
+static void _rpmalloc_heap_orphan(heap_t *heap, int first_class) {
+  heap->owner_thread = (uintptr_t)-1;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+  heap_t **heap_list =
+      (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps);
+#else
+  (void)sizeof(first_class);
+  heap_t **heap_list = &_memory_orphan_heaps;
+#endif
+  heap->next_orphan = *heap_list;
+  *heap_list = heap;
+}
+
+//! Allocate a new heap from newly mapped memory pages
+static heap_t *_rpmalloc_heap_allocate_new(void) {
+  // Map in pages for 16 heaps. If page size is greater than required size for
+  // this, map a page and use first part for heaps and remaining part for spans
+  // for allocations. 
Adds a lot of complexity, but saves a lot of memory on + // systems where page size > 64 spans (4MiB) + size_t heap_size = sizeof(heap_t); + size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); + size_t request_heap_count = 16; + size_t heap_span_count = ((aligned_heap_size * request_heap_count) + + sizeof(span_t) + _memory_span_size - 1) / + _memory_span_size; + size_t block_size = _memory_span_size * heap_span_count; + size_t span_count = heap_span_count; + span_t *span = 0; + // If there are global reserved spans, use these first + if (_memory_global_reserve_count >= heap_span_count) { + span = _rpmalloc_global_get_reserved_spans(heap_span_count); + } + if (!span) { + if (_memory_page_size > block_size) { + span_count = _memory_page_size / _memory_span_size; + block_size = _memory_page_size; + // If using huge pages, make sure to grab enough heaps to avoid + // reallocating a huge page just to serve new heaps + size_t possible_heap_count = + (block_size - sizeof(span_t)) / aligned_heap_size; + if (possible_heap_count >= (request_heap_count * 16)) + request_heap_count *= 16; + else if (possible_heap_count < request_heap_count) + request_heap_count = possible_heap_count; + heap_span_count = ((aligned_heap_size * request_heap_count) + + sizeof(span_t) + _memory_span_size - 1) / + _memory_span_size; + } + + size_t align_offset = 0; + span = (span_t *)_rpmalloc_mmap(block_size, &align_offset); + if (!span) + return 0; + + // Master span will contain the heaps + _rpmalloc_stat_inc(&_master_spans); + _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); + } + + size_t remain_size = _memory_span_size - sizeof(span_t); + heap_t *heap = (heap_t *)pointer_offset(span, sizeof(span_t)); + _rpmalloc_heap_initialize(heap); + + // Put extra heaps as orphans + size_t num_heaps = remain_size / aligned_heap_size; + if (num_heaps < request_heap_count) + num_heaps = request_heap_count; + atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); + heap_t *extra_heap = (heap_t *)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _rpmalloc_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _rpmalloc_heap_orphan(extra_heap, 1); + extra_heap = (heap_t *)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } + + if (span_count > heap_span_count) { + // Cap reserved spans + size_t remain_count = span_count - heap_span_count; + size_t reserve_count = + (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count + : remain_count); + span_t *remain_span = + (span_t *)pointer_offset(span, heap_span_count * _memory_span_size); + _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); + + if (remain_count > reserve_count) { + // Set to global reserved spans + remain_span = (span_t *)pointer_offset(remain_span, + reserve_count * _memory_span_size); + reserve_count = remain_count - reserve_count; + _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); + } + } + + return heap; +} + +static heap_t *_rpmalloc_heap_extract_orphan(heap_t **heap_list) { + heap_t *heap = *heap_list; + *heap_list = (heap ? heap->next_orphan : 0); + return heap; +} + +//! 
Allocate a new heap, potentially reusing a previously orphaned heap +static heap_t *_rpmalloc_heap_allocate(int first_class) { + heap_t *heap = 0; + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (first_class == 0) + heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); +#if RPMALLOC_FIRST_CLASS_HEAPS + if (!heap) + heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); +#endif + if (!heap) + heap = _rpmalloc_heap_allocate_new(); + atomic_store32_release(&_memory_global_lock, 0); + if (heap) + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + return heap; +} + +static void _rpmalloc_heap_release(void *heapptr, int first_class, + int release_cache) { + heap_t *heap = (heap_t *)heapptr; + if (!heap) + return; + // Release thread cache spans back to global cache + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + if (release_cache || heap->finalize) { +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + if (heap->finalize) { + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + } else { + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * + (iclass + 1) * + _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, + span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, + span_cache->count); + } +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + } + + if (get_thread_heap_raw() == heap) + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, + "Still active heaps during finalization"); +#endif + + // If we are forcibly terminating with _exit the state of the + // lock atomic is unknown and it's best to just go ahead and exit + if (get_thread_id() != _rpmalloc_main_thread_id) { + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + } + _rpmalloc_heap_orphan(heap, first_class); + atomic_store32_release(&_memory_global_lock, 0); +} + +static void _rpmalloc_heap_release_raw(void *heapptr, int release_cache) { + _rpmalloc_heap_release(heapptr, 0, release_cache); +} + +static void _rpmalloc_heap_release_raw_fc(void *heapptr) { + _rpmalloc_heap_release_raw(heapptr, 1); +} + +static void _rpmalloc_heap_finalize(heap_t *heap) { + if (heap->spans_reserved) { + span_t *span = _rpmalloc_span_map(heap, heap->spans_reserved); + _rpmalloc_span_unmap(span); + heap->spans_reserved = 0; + } + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].cache) + _rpmalloc_span_unmap(heap->size_class[iclass].cache); + heap->size_class[iclass].cache = 0; + span_t *span = heap->size_class[iclass].partial_span; + while (span) { + span_t *next = span->next; + _rpmalloc_span_finalize(heap, iclass, span, + &heap->size_class[iclass].partial_span); + span = next; + } + // If class still has a free list it must be a full span + if (heap->size_class[iclass].free_list) { + span_t *class_span = + (span_t *)((uintptr_t)heap->size_class[iclass].free_list & + _memory_span_mask); 
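+      // Spans are always aligned to the span size, so masking the class free
+      // list pointer with _memory_span_mask recovers the owning span header
+      // for the blocks still sitting on the heap class free list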
+ span_t **list = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + list = &heap->full_span[iclass]; +#endif + --heap->full_span_count; + if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { + if (list) + _rpmalloc_span_double_link_list_remove(list, class_span); + _rpmalloc_span_double_link_list_add( + &heap->size_class[iclass].partial_span, class_span); + } + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred), + "Heaps still active during finalization"); +} + +//////////// +/// +/// Allocation entry points +/// +////// + +//! Pop first block from a free list +static void *free_list_pop(void **list) { + void *block = *list; + *list = *((void **)block); + return block; +} + +//! Allocate a small/medium sized memory block from the given heap +static void *_rpmalloc_allocate_from_heap_fallback( + heap_t *heap, heap_size_class_t *heap_size_class, uint32_t class_idx) { + span_t *span = heap_size_class->partial_span; + rpmalloc_assume(heap != 0); + if (EXPECTED(span != 0)) { + rpmalloc_assert(span->block_count == + _memory_size_class[span->size_class].block_count, + "Span block count corrupted"); + rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span), + "Internal failure"); + void *block; + if (span->free_list) { + // Span local free list is not empty, swap to size class free list + block = free_list_pop(&span->free_list); + heap_size_class->free_list = span->free_list; + span->free_list = 0; + } else { + // If the span did not fully initialize free list, link up another page + // worth of blocks + void *block_start = pointer_offset( + span, SPAN_HEADER_SIZE + + ((size_t)span->free_list_limit * span->block_size)); + span->free_list_limit += free_list_partial_init( + &heap_size_class->free_list, &block, + (void *)((uintptr_t)block_start & ~(_memory_page_size - 1)), + block_start, span->block_count - span->free_list_limit, + span->block_size); + } + rpmalloc_assert(span->free_list_limit <= span->block_count, + "Span block count corrupted"); + span->used_count = span->free_list_limit; + + // Swap in deferred free list if present + if (atomic_load_ptr(&span->free_list_deferred)) + _rpmalloc_span_extract_free_list_deferred(span); + + // If span is still not fully utilized keep it in partial list and early + // return block + if (!_rpmalloc_span_is_fully_utilized(span)) + return block; + + // The span is fully utilized, unlink from partial list and add to fully + // utilized list + _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span, + span); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + return block; + } + + // Find a span in one of the cache levels + span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx); + if (EXPECTED(span != 0)) { + // Mark span as owned by this heap and set base data, return first block + return _rpmalloc_span_initialize_new(heap, heap_size_class, span, + class_idx); + } + + return 0; +} + +//! 
Allocate a small sized memory block from the given heap +static void *_rpmalloc_allocate_small(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Small sizes have unique size classes + const uint32_t class_idx = + (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); + heap_size_class_t *heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, + class_idx); +} + +//! Allocate a medium sized memory block from the given heap +static void *_rpmalloc_allocate_medium(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Calculate the size class index and do a dependent lookup of the final class + // index (in case of merged classes) + const uint32_t base_idx = + (uint32_t)(SMALL_CLASS_COUNT + + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); + const uint32_t class_idx = _memory_size_class[base_idx].class_idx; + heap_size_class_t *heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, + class_idx); +} + +//! Allocate a large sized memory block from the given heap +static void *_rpmalloc_allocate_large(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Calculate number of needed max sized spans (including header) + // Since this function is never called if size > LARGE_SIZE_LIMIT + // the span_count is guaranteed to be <= LARGE_CLASS_COUNT + size += SPAN_HEADER_SIZE; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + + // Find a span in one of the cache levels + span_t *span = + _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE); + if (!span) + return span; + + // Mark span as owned by this heap and set base data + rpmalloc_assert(span->span_count >= span_count, "Internal failure"); + span->size_class = SIZE_CLASS_LARGE; + span->heap = heap; + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a huge block by mapping memory pages directly +static void *_rpmalloc_allocate_huge(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + size += SPAN_HEADER_SIZE; + size_t num_pages = size >> _memory_page_size_shift; + if (size & (_memory_page_size - 1)) + ++num_pages; + size_t align_offset = 0; + span_t *span = + (span_t *)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); + if (!span) + return span; + + // Store page count in span_count + span->size_class = SIZE_CLASS_HUGE; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! 
Allocate a block of the given size
+static void *_rpmalloc_allocate(heap_t *heap, size_t size) {
+  _rpmalloc_stat_add64(&_allocation_counter, 1);
+  if (EXPECTED(size <= SMALL_SIZE_LIMIT))
+    return _rpmalloc_allocate_small(heap, size);
+  else if (size <= _memory_medium_size_limit)
+    return _rpmalloc_allocate_medium(heap, size);
+  else if (size <= LARGE_SIZE_LIMIT)
+    return _rpmalloc_allocate_large(heap, size);
+  return _rpmalloc_allocate_huge(heap, size);
+}
+
+static void *_rpmalloc_aligned_allocate(heap_t *heap, size_t alignment,
+                                        size_t size) {
+  if (alignment <= SMALL_GRANULARITY)
+    return _rpmalloc_allocate(heap, size);
+
+#if ENABLE_VALIDATE_ARGS
+  if ((size + alignment) < size) {
+    errno = EINVAL;
+    return 0;
+  }
+  if (alignment & (alignment - 1)) {
+    errno = EINVAL;
+    return 0;
+  }
+#endif
+
+  if ((alignment <= SPAN_HEADER_SIZE) &&
+      ((size + SPAN_HEADER_SIZE) < _memory_medium_size_limit)) {
+    // If alignment is less or equal to span header size (which is power of
+    // two), and size aligned to span header size multiples is less than size +
+    // alignment, then use natural alignment of blocks to provide alignment
+    size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) &
+                                      ~(uintptr_t)(SPAN_HEADER_SIZE - 1)
+                                : SPAN_HEADER_SIZE;
+    rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE),
+                    "Failed alignment calculation");
+    if (multiple_size <= (size + alignment))
+      return _rpmalloc_allocate(heap, multiple_size);
+  }
+
+  void *ptr = 0;
+  size_t align_mask = alignment - 1;
+  if (alignment <= _memory_page_size) {
+    ptr = _rpmalloc_allocate(heap, size + alignment);
+    if ((uintptr_t)ptr & align_mask) {
+      ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
+      // Mark as having aligned blocks
+      span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask);
+      span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
+    }
+    return ptr;
+  }
+
+  // Fallback to mapping new pages for this request. Since pointers passed
+  // to rpfree must be able to reach the start of the span by bitmasking of
+  // the address with the span size, the returned aligned pointer from this
+  // function must be within a span size of the start of the mapped area.
+  // In worst case this requires us to loop and map pages until we get a
+  // suitable memory address. 
It also means we can never align to span size + // or greater, since the span header will push alignment more than one + // span size away from span start (thus causing pointer mask to give us + // an invalid span start on free) + if (alignment & align_mask) { + errno = EINVAL; + return 0; + } + if (alignment >= _memory_span_size) { + errno = EINVAL; + return 0; + } + + size_t extra_pages = alignment / _memory_page_size; + + // Since each span has a header, we will at least need one extra memory page + size_t num_pages = 1 + (size / _memory_page_size); + if (size & (_memory_page_size - 1)) + ++num_pages; + + if (extra_pages > num_pages) + num_pages = 1 + extra_pages; + + size_t original_pages = num_pages; + size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; + if (limit_pages < (original_pages * 2)) + limit_pages = original_pages * 2; + + size_t mapped_size, align_offset; + span_t *span; + +retry: + align_offset = 0; + mapped_size = num_pages * _memory_page_size; + + span = (span_t *)_rpmalloc_mmap(mapped_size, &align_offset); + if (!span) { + errno = ENOMEM; + return 0; + } + ptr = pointer_offset(span, SPAN_HEADER_SIZE); + + if ((uintptr_t)ptr & align_mask) + ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + + if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || + (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || + (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { + _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size); + ++num_pages; + if (num_pages > limit_pages) { + errno = EINVAL; + return 0; + } + goto retry; + } + + // Store page count in span_count + span->size_class = SIZE_CLASS_HUGE; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + _rpmalloc_stat_add64(&_allocation_counter, 1); + + return ptr; +} + +//////////// +/// +/// Deallocation entry points +/// +////// + +//! Deallocate the given small/medium memory block in the current thread local +//! 
heap +static void _rpmalloc_deallocate_direct_small_or_medium(span_t *span, + void *block) { + heap_t *heap = span->heap; + rpmalloc_assert(heap->owner_thread == get_thread_id() || + !heap->owner_thread || heap->finalize, + "Internal failure"); + // Add block to free list + if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) { + span->used_count = span->block_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], + span); +#endif + _rpmalloc_span_double_link_list_add( + &heap->size_class[span->size_class].partial_span, span); + --heap->full_span_count; + } + *((void **)block) = span->free_list; + --span->used_count; + span->free_list = block; + if (UNEXPECTED(span->used_count == span->list_size)) { + // If there are no used blocks it is guaranteed that no other external + // thread is accessing the span + if (span->used_count) { + // Make sure we have synchronized the deferred list and list size by using + // acquire semantics and guarantee that no external thread is accessing + // span concurrently + void *free_list; + do { + free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, + INVALID_POINTER); + } while (free_list == INVALID_POINTER); + atomic_store_ptr_release(&span->free_list_deferred, free_list); + } + _rpmalloc_span_double_link_list_remove( + &heap->size_class[span->size_class].partial_span, span); + _rpmalloc_span_release_to_cache(heap, span); + } +} + +static void _rpmalloc_deallocate_defer_free_span(heap_t *heap, span_t *span) { + if (span->size_class != SIZE_CLASS_HUGE) + _rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred); + // This list does not need ABA protection, no mutable side state + do { + span->free_list = (void *)atomic_load_ptr(&heap->span_free_deferred); + } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list)); +} + +//! Put the block in the deferred free list of the owning span +static void _rpmalloc_deallocate_defer_small_or_medium(span_t *span, + void *block) { + // The memory ordering here is a bit tricky, to avoid having to ABA protect + // the deferred free list to avoid desynchronization of list and list size + // we need to have acquire semantics on successful CAS of the pointer to + // guarantee the list_size variable validity + release semantics on pointer + // store + void *free_list; + do { + free_list = + atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (free_list == INVALID_POINTER); + *((void **)block) = free_list; + uint32_t free_count = ++span->list_size; + int all_deferred_free = (free_count == span->block_count); + atomic_store_ptr_release(&span->free_list_deferred, block); + if (all_deferred_free) { + // Span was completely freed by this block. Due to the INVALID_POINTER spin + // lock no other thread can reach this state simultaneously on this span. 
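+    // (list_size has reached block_count, so every block in the span is now
+    // on the deferred list and no other thread still references the span)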
+ // Safe to move to owner heap deferred cache + _rpmalloc_deallocate_defer_free_span(span->heap, span); + } +} + +static void _rpmalloc_deallocate_small_or_medium(span_t *span, void *p) { + _rpmalloc_stat_inc_free(span->heap, span->size_class); + if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { + // Realign pointer to block start + void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); + } + // Check if block belongs to this heap or if deallocation should be deferred +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = + (span->heap->owner_thread && + (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = + ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (!defer) + _rpmalloc_deallocate_direct_small_or_medium(span, p); + else + _rpmalloc_deallocate_defer_small_or_medium(span, p); +} + +//! Deallocate the given large memory block to the current heap +static void _rpmalloc_deallocate_large(span_t *span) { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || + !(span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || + (span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + // We must always defer (unless finalizing) if from another heap since we + // cannot touch the list or counters of another heap +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = + (span->heap->owner_thread && + (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = + ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { + _rpmalloc_deallocate_defer_free_span(span->heap, span); + return; + } + rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); + --span->heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + // Decrease counter + size_t idx = span->span_count - 1; + atomic_decr32(&span->heap->span_use[idx].current); +#endif + heap_t *heap = span->heap; + rpmalloc_assert(heap, "No thread heap"); +#if ENABLE_THREAD_CACHE + const int set_as_reserved = + ((span->span_count > 1) && (heap->span_cache.count == 0) && + !heap->finalize && !heap->spans_reserved); +#else + const int set_as_reserved = + ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved); +#endif + if (set_as_reserved) { + heap->span_reserve = span; + heap->spans_reserved = span->span_count; + if (span->flags & SPAN_FLAG_MASTER) { + heap->span_reserve_master = span; + } else { // SPAN_FLAG_SUBSPAN + span_t *master = (span_t *)pointer_offset( + span, + -(intptr_t)((size_t)span->offset_from_master * _memory_span_size)); + heap->span_reserve_master = master; + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); + rpmalloc_assert(atomic_load32(&master->remaining_spans) >= + (int32_t)span->span_count, + "Master span count corrupted"); + } + _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved); + } else { + // Insert into cache list + _rpmalloc_heap_cache_insert(heap, span); + } +} + +//! 
Deallocate the given huge span
+static void _rpmalloc_deallocate_huge(span_t *span) {
+  rpmalloc_assert(span->heap, "No span heap");
+#if RPMALLOC_FIRST_CLASS_HEAPS
+  int defer =
+      (span->heap->owner_thread &&
+       (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#else
+  int defer =
+      ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#endif
+  if (defer) {
+    _rpmalloc_deallocate_defer_free_span(span->heap, span);
+    return;
+  }
+  rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted");
+  --span->heap->full_span_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+  _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
+#endif
+
+  // Oversized allocation, page count is stored in span_count
+  size_t num_pages = span->span_count;
+  _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset,
+                  num_pages * _memory_page_size);
+  _rpmalloc_stat_sub(&_huge_pages_current, num_pages);
+}
+
+//! Deallocate the given block
+static void _rpmalloc_deallocate(void *p) {
+  _rpmalloc_stat_add64(&_deallocation_counter, 1);
+  // Grab the span (always at start of span, using span alignment)
+  span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
+  if (UNEXPECTED(!span))
+    return;
+  if (EXPECTED(span->size_class < SIZE_CLASS_COUNT))
+    _rpmalloc_deallocate_small_or_medium(span, p);
+  else if (span->size_class == SIZE_CLASS_LARGE)
+    _rpmalloc_deallocate_large(span);
+  else
+    _rpmalloc_deallocate_huge(span);
+}
+
+////////////
+///
+/// Reallocation entry points
+///
+//////
+
+static size_t _rpmalloc_usable_size(void *p);
+
+//! Reallocate the given block to the given size
+static void *_rpmalloc_reallocate(heap_t *heap, void *p, size_t size,
+                                  size_t oldsize, unsigned int flags) {
+  if (p) {
+    // Grab the span using guaranteed span alignment
+    span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
+    if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) {
+      // Small/medium sized block
+      rpmalloc_assert(span->span_count == 1, "Span counter corrupted");
+      void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+      uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
+      uint32_t block_idx = block_offset / span->block_size;
+      void *block =
+          pointer_offset(blocks_start, (size_t)block_idx * span->block_size);
+      if (!oldsize)
+        oldsize =
+            (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block));
+      if ((size_t)span->block_size >= size) {
+        // Still fits in block, never mind trying to save memory, but preserve
+        // data if alignment changed
+        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+          memmove(block, p, oldsize);
+        return block;
+      }
+    } else if (span->size_class == SIZE_CLASS_LARGE) {
+      // Large block
+      size_t total_size = size + SPAN_HEADER_SIZE;
+      size_t num_spans = total_size >> _memory_span_size_shift;
+      if (total_size & (_memory_span_size - 1))
+        ++num_spans;
+      size_t current_spans = span->span_count;
+      void *block = pointer_offset(span, SPAN_HEADER_SIZE);
+      if (!oldsize)
+        oldsize = (current_spans * _memory_span_size) -
+                  (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
+      if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) {
+        // Still fits in block, never mind trying to save memory, but preserve
+        // data if alignment changed
+        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+          memmove(block, p, oldsize);
+        return block;
+      }
+    } else {
+      // Oversized block
+      size_t total_size = size + SPAN_HEADER_SIZE;
+      size_t num_pages = total_size >> _memory_page_size_shift;
+      if 
(total_size & (_memory_page_size - 1))
+        ++num_pages;
+      // Page count is stored in span_count
+      size_t current_pages = span->span_count;
+      void *block = pointer_offset(span, SPAN_HEADER_SIZE);
+      if (!oldsize)
+        oldsize = (current_pages * _memory_page_size) -
+                  (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
+      if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) {
+        // Still fits in block, never mind trying to save memory, but preserve
+        // data if alignment changed
+        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+          memmove(block, p, oldsize);
+        return block;
+      }
+    }
+  } else {
+    oldsize = 0;
+  }
+
+  if (!!(flags & RPMALLOC_GROW_OR_FAIL))
+    return 0;
+
+  // Size is greater than block size, need to allocate a new block and
+  // deallocate the old. Avoid hysteresis by overallocating if the increase
+  // is small (below 37.5%: lower_bound is oldsize + oldsize/4 + oldsize/8)
+  size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
+  size_t new_size =
+      (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
+  void *block = _rpmalloc_allocate(heap, new_size);
+  if (p && block) {
+    if (!(flags & RPMALLOC_NO_PRESERVE))
+      memcpy(block, p, oldsize < new_size ? oldsize : new_size);
+    _rpmalloc_deallocate(p);
+  }
+
+  return block;
+}
+
+static void *_rpmalloc_aligned_reallocate(heap_t *heap, void *ptr,
+                                          size_t alignment, size_t size,
+                                          size_t oldsize, unsigned int flags) {
+  if (alignment <= SMALL_GRANULARITY)
+    return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags);
+
+  int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL);
+  size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0);
+  if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) {
+    if (no_alloc || (size >= (usablesize / 2)))
+      return ptr;
+  }
+  // Aligned alloc marks span as having aligned blocks
+  void *block =
+      (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0);
+  if (EXPECTED(block != 0)) {
+    if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) {
+      if (!oldsize)
+        oldsize = usablesize;
+      memcpy(block, ptr, oldsize < size ? oldsize : size);
+    }
+    _rpmalloc_deallocate(ptr);
+  }
+  return block;
+}
+
+////////////
+///
+/// Initialization, finalization and utility
+///
+//////
+
+//! Get the usable size of the given block
+static size_t _rpmalloc_usable_size(void *p) {
+  // Grab the span using guaranteed span alignment
+  span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
+  if (span->size_class < SIZE_CLASS_COUNT) {
+    // Small/medium block
+    void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+    return span->block_size -
+           ((size_t)pointer_diff(p, blocks_start) % span->block_size);
+  }
+  if (span->size_class == SIZE_CLASS_LARGE) {
+    // Large block
+    size_t current_spans = span->span_count;
+    return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span);
+  }
+  // Oversized block, page count is stored in span_count
+  size_t current_pages = span->span_count;
+  return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span);
+}
+
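+// Editorial note: a minimal worked example of the small/medium path above
+// (an illustration, not part of the allocator). Values assume the default
+// 16-byte SMALL_GRANULARITY, so a 40-byte request is served from the
+// 48-byte size class:
+//
+//   void *p = rpmalloc(40);
+//   size_t usable = rpmalloc_usable_size(p); // 48: block end minus pointer
+//   rpfree(p);
+//
+// The modulo term in the small/medium path handles spans with aligned
+// blocks, where a returned pointer may sit at an offset inside its block.
+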
+//! Adjust and optimize the size class properties for the given class
+static void _rpmalloc_adjust_size_class(size_t iclass) {
+  size_t block_size = _memory_size_class[iclass].block_size;
+  size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size;
+
+  _memory_size_class[iclass].block_count = (uint16_t)block_count;
+  _memory_size_class[iclass].class_idx = (uint16_t)iclass;
+
+  // Check if previous size classes can be merged
+  if (iclass >= SMALL_CLASS_COUNT) {
+    size_t prevclass = iclass;
+    while (prevclass > 0) {
+      --prevclass;
+      // A class can be merged if number of pages and number of blocks are equal
+      if (_memory_size_class[prevclass].block_count ==
+          _memory_size_class[iclass].block_count)
+        _rpmalloc_memcpy_const(_memory_size_class + prevclass,
+                               _memory_size_class + iclass,
+                               sizeof(_memory_size_class[iclass]));
+      else
+        break;
+    }
+  }
+}
+
+//! Initialize the allocator and setup global data
+extern inline int rpmalloc_initialize(void) {
+  if (_rpmalloc_initialized) {
+    rpmalloc_thread_initialize();
+    return 0;
+  }
+  return rpmalloc_initialize_config(0);
+}
+
+int rpmalloc_initialize_config(const rpmalloc_config_t *config) {
+  if (_rpmalloc_initialized) {
+    rpmalloc_thread_initialize();
+    return 0;
+  }
+  _rpmalloc_initialized = 1;
+
+  if (config)
+    memcpy(&_memory_config, config, sizeof(rpmalloc_config_t));
+  else
+    _rpmalloc_memset_const(&_memory_config, 0, sizeof(rpmalloc_config_t));
+
+  if (!_memory_config.memory_map || !_memory_config.memory_unmap) {
+    _memory_config.memory_map = _rpmalloc_mmap_os;
+    _memory_config.memory_unmap = _rpmalloc_unmap_os;
+  }
+
+#if PLATFORM_WINDOWS
+  SYSTEM_INFO system_info;
+  memset(&system_info, 0, sizeof(system_info));
+  GetSystemInfo(&system_info);
+  _memory_map_granularity = system_info.dwAllocationGranularity;
+#else
+  _memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE);
+#endif
+
+#if RPMALLOC_CONFIGURABLE
+  _memory_page_size = _memory_config.page_size;
+#else
+  _memory_page_size = 0;
+#endif
+  _memory_huge_pages = 0;
+  if (!_memory_page_size) {
+#if PLATFORM_WINDOWS
+    _memory_page_size = system_info.dwPageSize;
+#else
+    _memory_page_size = _memory_map_granularity;
+    if (_memory_config.enable_huge_pages) {
+#if defined(__linux__)
+      size_t huge_page_size = 0;
+      FILE *meminfo = fopen("/proc/meminfo", "r");
+      if (meminfo) {
+        char line[128];
+        while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) {
+          line[sizeof(line) - 1] = 0;
+          if (strstr(line, "Hugepagesize:"))
+            huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024;
+        }
+        fclose(meminfo);
+      }
+      if (huge_page_size) {
+        _memory_huge_pages = 1;
+        _memory_page_size = huge_page_size;
+        _memory_map_granularity = huge_page_size;
+      }
+#elif defined(__FreeBSD__)
+      int rc;
+      size_t sz = sizeof(rc);
+
+      if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 &&
+          rc == 1) {
+        static size_t defsize = 2 * 1024 * 1024;
+        int nsize = 0;
+        size_t sizes[4] = {0};
+        _memory_huge_pages = 1;
+        _memory_page_size = defsize;
+        if ((nsize = getpagesizes(sizes, 4)) >= 2) {
+          nsize--;
+          for (size_t csize = sizes[nsize]; nsize >= 0 && csize;
+               --nsize, csize = sizes[nsize]) {
+            //! Unlikely, but as a precaution...
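+            // (Editorial note: csize & (csize - 1) clears the lowest set
+            // bit, so it is zero only when csize is a power of two, e.g.
+            // 0x200000 & 0x1FFFFF == 0; together with the divisibility
+            // check this rejects malformed sizes from getpagesizes().)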
+ rpmalloc_assert(!(csize & (csize - 1)) && !(csize % 1024), + "Invalid page size"); + if (defsize < csize) { + _memory_page_size = csize; + break; + } + } + } + _memory_map_granularity = _memory_page_size; + } +#elif defined(__APPLE__) || defined(__NetBSD__) + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; +#endif + } +#endif + } else { + if (_memory_config.enable_huge_pages) + _memory_huge_pages = 1; + } + +#if PLATFORM_WINDOWS + if (_memory_config.enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), + TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + if (GetLastError() == ERROR_SUCCESS) + _memory_huge_pages = 1; + } + } + CloseHandle(token); + } + if (_memory_huge_pages) { + if (large_page_minimum > _memory_page_size) + _memory_page_size = large_page_minimum; + if (large_page_minimum > _memory_map_granularity) + _memory_map_granularity = large_page_minimum; + } + } +#endif + + size_t min_span_size = 256; + size_t max_page_size; +#if UINTPTR_MAX > 0xFFFFFFFF + max_page_size = 4096ULL * 1024ULL * 1024ULL; +#else + max_page_size = 4 * 1024 * 1024; +#endif + if (_memory_page_size < min_span_size) + _memory_page_size = min_span_size; + if (_memory_page_size > max_page_size) + _memory_page_size = max_page_size; + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + +#if RPMALLOC_CONFIGURABLE + if (!_memory_config.span_size) { + _memory_span_size = _memory_default_span_size; + _memory_span_size_shift = _memory_default_span_size_shift; + _memory_span_mask = _memory_default_span_mask; + } else { + size_t span_size = _memory_config.span_size; + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); + } +#endif + + _memory_span_map_count = + (_memory_config.span_map_count ? _memory_config.span_map_count + : DEFAULT_SPAN_MAP_COUNT); + if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + if ((_memory_page_size >= _memory_span_size) && + ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + _memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) + ? 
DEFAULT_SPAN_MAP_COUNT + : _memory_span_map_count; + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + _memory_config.span_map_count = _memory_span_map_count; + _memory_config.enable_huge_pages = _memory_huge_pages; + +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ + defined(__TINYC__) + if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc)) + return -1; +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + fls_key = FlsAlloc(&_rpmalloc_thread_destructor); +#endif + + // Setup all small and medium size classes + size_t iclass = 0; + _memory_size_class[iclass].block_size = SMALL_GRANULARITY; + _rpmalloc_adjust_size_class(iclass); + for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = iclass * SMALL_GRANULARITY; + _memory_size_class[iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(iclass); + } + // At least two blocks per span, then fall back to large allocations + _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) { + _memory_medium_size_limit = + SMALL_SIZE_LIMIT + (iclass * MEDIUM_GRANULARITY); + break; + } + _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + _memory_orphan_heaps = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_first_class_orphan_heaps = 0; +#endif +#if ENABLE_STATISTICS + atomic_store32(&_memory_active_heaps, 0); + atomic_store32(&_mapped_pages, 0); + _mapped_pages_peak = 0; + atomic_store32(&_master_spans, 0); + atomic_store32(&_mapped_total, 0); + atomic_store32(&_unmapped_total, 0); + atomic_store32(&_mapped_pages_os, 0); + atomic_store32(&_huge_pages_current, 0); + _huge_pages_peak = 0; +#endif + memset(_memory_heaps, 0, sizeof(_memory_heaps)); + atomic_store32_release(&_memory_global_lock, 0); + + rpmalloc_linker_reference(); + + // Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! 
Finalize the allocator +void rpmalloc_finalize(void) { + rpmalloc_thread_finalize(1); + // rpmalloc_dump_statistics(stdout); + + if (_memory_global_reserve) { + atomic_add32(&_memory_global_reserve_master->remaining_spans, + -(int32_t)_memory_global_reserve_count); + _memory_global_reserve_master = 0; + _memory_global_reserve_count = 0; + _memory_global_reserve = 0; + } + atomic_store32_release(&_memory_global_lock, 0); + + // Free all thread caches and fully free spans + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t *heap = _memory_heaps[list_idx]; + while (heap) { + heap_t *next_heap = heap->next_heap; + heap->finalize = 1; + _rpmalloc_heap_global_finalize(heap); + heap = next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + // Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]); +#endif + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsFree(fls_key); + fls_key = 0; +#endif +#if ENABLE_STATISTICS + // If you hit these asserts you probably have memory leaks (perhaps global + // scope data doing dynamic allocations) or double frees in your code + rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected"); + rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, + "Memory leak detected"); +#endif + + _rpmalloc_initialized = 0; +} + +//! Initialize thread, assign heap +extern inline void rpmalloc_thread_initialize(void) { + if (!get_thread_heap_raw()) { + heap_t *heap = _rpmalloc_heap_allocate(0); + if (heap) { + _rpmalloc_stat_inc(&_memory_active_heaps); + set_thread_heap(heap); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, heap); +#endif + } + } +} + +//! Finalize thread, orphan heap +void rpmalloc_thread_finalize(int release_caches) { + heap_t *heap = get_thread_heap_raw(); + if (heap) + _rpmalloc_heap_release_raw(heap, release_caches); + set_thread_heap(0); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, 0); +#endif +} + +int rpmalloc_is_thread_initialized(void) { + return (get_thread_heap_raw() != 0) ? 
1 : 0; +} + +const rpmalloc_config_t *rpmalloc_config(void) { return &_memory_config; } + +// Extern interface + +extern inline RPMALLOC_ALLOCATOR void *rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_allocate(heap, size); +} + +extern inline void rpfree(void *ptr) { _rpmalloc_deallocate(ptr); } + +extern inline RPMALLOC_ALLOCATOR void *rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t *heap = get_thread_heap(); + void *block = _rpmalloc_allocate(heap, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void *rprealloc(void *ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_reallocate(heap, ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void *rpaligned_realloc(void *ptr, size_t alignment, + size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, + flags); +} + +extern RPMALLOC_ALLOCATOR void *rpaligned_alloc(size_t alignment, size_t size) { + heap_t *heap = get_thread_heap(); + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpaligned_calloc(size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void *block = rpaligned_alloc(alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void *rpmemalign(size_t alignment, + size_t size) { + return rpaligned_alloc(alignment, size); +} + +extern inline int rpposix_memalign(void **memptr, size_t alignment, + size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 0 : ENOMEM; +} + +extern inline size_t rpmalloc_usable_size(void *ptr) { + return (ptr ? 
_rpmalloc_usable_size(ptr) : 0); +} + +extern inline void rpmalloc_thread_collect(void) {} + +void rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t *heap = get_thread_heap_raw(); + if (!heap) + return; + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + size_class_t *size_class = _memory_size_class + iclass; + span_t *span = heap->size_class[iclass].partial_span; + while (span) { + size_t free_count = span->list_size; + size_t block_count = size_class->block_count; + if (span->free_list_limit < block_count) + block_count = span->free_list_limit; + free_count += (block_count - span->used_count); + stats->sizecache += free_count * size_class->block_size; + span = span->next; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + stats->spancache += span_cache->count * (iclass + 1) * _memory_span_size; + } +#endif + + span_t *deferred = (span_t *)atomic_load_ptr(&heap->span_free_deferred); + while (deferred) { + if (deferred->size_class != SIZE_CLASS_HUGE) + stats->spancache += (size_t)deferred->span_count * _memory_span_size; + deferred = (span_t *)deferred->free_list; + } + +#if ENABLE_STATISTICS + stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); + stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->span_use[iclass].current = + (size_t)atomic_load32(&heap->span_use[iclass].current); + stats->span_use[iclass].peak = + (size_t)atomic_load32(&heap->span_use[iclass].high); + stats->span_use[iclass].to_global = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); + stats->span_use[iclass].from_global = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); + stats->span_use[iclass].to_cache = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); + stats->span_use[iclass].from_cache = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); + stats->span_use[iclass].to_reserved = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); + stats->span_use[iclass].from_reserved = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); + stats->span_use[iclass].map_calls = + (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); + } + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + stats->size_use[iclass].alloc_current = + (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); + stats->size_use[iclass].alloc_peak = + (size_t)heap->size_class_use[iclass].alloc_peak; + stats->size_use[iclass].alloc_total = + (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); + stats->size_use[iclass].free_total = + (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); + stats->size_use[iclass].spans_to_cache = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); + stats->size_use[iclass].spans_from_cache = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); + stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32( + &heap->size_class_use[iclass].spans_from_reserved); + stats->size_use[iclass].map_calls = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); + } +#endif +} + +void 
rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + stats->mapped_total = + (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + stats->unmapped_total = + (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + stats->huge_alloc = + (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; +#endif +#if ENABLE_GLOBAL_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + global_cache_t *cache = &_memory_span_cache[iclass]; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + uint32_t count = cache->count; +#if ENABLE_UNLIMITED_CACHE + span_t *current_span = cache->overflow; + while (current_span) { + ++count; + current_span = current_span->next; + } +#endif + atomic_store32_release(&cache->lock, 0); + stats->cached += count * (iclass + 1) * _memory_span_size; + } +#endif +} + +#if ENABLE_STATISTICS + +static void _memory_heap_dump_statistics(heap_t *heap, void *file) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize " + "BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB " + "FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) + continue; + fprintf( + file, + "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu " + "%9u\n", + (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + atomic_load32(&heap->size_class_use[iclass].alloc_total), + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + atomic_load32(&heap->size_class_use[iclass].spans_current), + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * + (size_t)_memory_size_class[iclass].block_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * + _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * + _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32( + &heap->size_class_use[iclass].spans_from_reserved) * + _memory_span_size) / + (size_t)(1024 * 1024), + atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); + } + fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB " + "FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB " + "FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && + !atomic_load32(&heap->span_use[iclass].spans_map_calls)) + continue; + fprintf( + file, + "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", + (uint32_t)(iclass + 1), atomic_load32(&heap->span_use[iclass].current), + atomic_load32(&heap->span_use[iclass].high), + atomic_load32(&heap->span_use[iclass].spans_deferred), + ((size_t)atomic_load32(&heap->span_use[iclass].high) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), +#if ENABLE_THREAD_CACHE + (unsigned int)(!iclass ? 
heap->span_cache.count + : heap->span_large_cache[iclass - 1].count), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), +#else + 0, (size_t)0, (size_t)0, +#endif + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), + atomic_load32(&heap->span_use[iclass].spans_map_calls)); + } + fprintf(file, "Full spans: %zu\n", heap->full_span_count); + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf( + file, "%17zu %17zu\n", + (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), + (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); +} + +#endif + +void rpmalloc_dump_statistics(void *file) { +#if ENABLE_STATISTICS + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t *heap = _memory_heaps[list_idx]; + while (heap) { + int need_dump = 0; + for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); + ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { + rpmalloc_assert( + !atomic_load32(&heap->size_class_use[iclass].free_total), + "Heap statistics counter mismatch"); + rpmalloc_assert( + !atomic_load32(&heap->size_class_use[iclass].spans_map_calls), + "Heap statistics counter mismatch"); + continue; + } + need_dump = 1; + } + for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); + ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && + !atomic_load32(&heap->span_use[iclass].spans_map_calls)) + continue; + need_dump = 1; + } + if (need_dump) + _memory_heap_dump_statistics(heap, file); + heap = heap->next_heap; + } + } + fprintf(file, "Global stats:\n"); + size_t huge_current = + (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; + fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); + fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), + huge_peak / (size_t)(1024 * 1024)); + +#if ENABLE_GLOBAL_CACHE + fprintf(file, "GlobalCacheMiB\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + global_cache_t *cache = _memory_span_cache + iclass; + size_t global_cache = (size_t)cache->count * iclass * _memory_span_size; + + size_t global_overflow_cache = 0; + span_t *span = cache->overflow; + while (span) { + global_overflow_cache += iclass * _memory_span_size; + span = span->next; + } + if (global_cache || global_overflow_cache || cache->insert_count || + cache->extract_count) + fprintf(file, + "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", + iclass + 1, global_cache / (size_t)(1024 * 1024), + global_overflow_cache / (size_t)(1024 * 1024), + cache->insert_count, cache->extract_count); + } +#endif + + size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + size_t mapped_os = + (size_t)atomic_load32(&_mapped_pages_os) * 
_memory_page_size; + size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + size_t mapped_total = + (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + size_t unmapped_total = + (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + fprintf( + file, + "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n"); + fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n", + mapped / (size_t)(1024 * 1024), mapped_os / (size_t)(1024 * 1024), + mapped_peak / (size_t)(1024 * 1024), + mapped_total / (size_t)(1024 * 1024), + unmapped_total / (size_t)(1024 * 1024)); + + fprintf(file, "\n"); +#if 0 + int64_t allocated = atomic_load64(&_allocation_counter); + int64_t deallocated = atomic_load64(&_deallocation_counter); + fprintf(file, "Allocation count: %lli\n", allocated); + fprintf(file, "Deallocation count: %lli\n", deallocated); + fprintf(file, "Current allocations: %lli\n", (allocated - deallocated)); + fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); + fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); +#endif +#endif + (void)sizeof(file); +} + +#if RPMALLOC_FIRST_CLASS_HEAPS + +extern inline rpmalloc_heap_t *rpmalloc_heap_acquire(void) { + // Must be a pristine heap from newly mapped memory pages, or else memory + // blocks could already be allocated from the heap which would (wrongly) be + // released when heap is cleared with rpmalloc_heap_free_all(). Also heaps + // guaranteed to be pristine from the dedicated orphan list can be used. + heap_t *heap = _rpmalloc_heap_allocate(1); + rpmalloc_assume(heap != NULL); + heap->owner_thread = 0; + _rpmalloc_stat_inc(&_memory_active_heaps); + return heap; +} + +extern inline void rpmalloc_heap_release(rpmalloc_heap_t *heap) { + if (heap) + _rpmalloc_heap_release(heap, 1, 1); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_allocate(heap, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, + size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, size_t size) { + return rpmalloc_heap_aligned_calloc(heap, 0, num, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, + size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void *block = _rpmalloc_aligned_allocate(heap, alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _rpmalloc_reallocate(heap, ptr, size, 0, flags); +} + +extern inline RPMALLOC_ALLOCATOR void * 
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t *heap, void *ptr, + size_t alignment, size_t size, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); +} + +extern inline void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr) { + (void)sizeof(heap); + _rpmalloc_deallocate(ptr); +} + +extern inline void rpmalloc_heap_free_all(rpmalloc_heap_t *heap) { + span_t *span; + span_t *next_span; + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + span = heap->size_class[iclass].partial_span; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->size_class[iclass].partial_span = 0; + span = heap->full_span[iclass]; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + + span = heap->size_class[iclass].cache; + if (span) + _rpmalloc_heap_cache_insert(heap, span); + heap->size_class[iclass].cache = 0; + } + memset(heap->size_class, 0, sizeof(heap->size_class)); + memset(heap->full_span, 0, sizeof(heap->full_span)); + + span = heap->large_huge_span; + while (span) { + next_span = span->next; + if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) + _rpmalloc_deallocate_huge(span); + else + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->large_huge_span = 0; + heap->full_span_count = 0; + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, + span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, + span_cache->count); +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + +#if ENABLE_STATISTICS + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); + atomic_store32(&heap->size_class_use[iclass].spans_current, 0); + } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->span_use[iclass].current, 0); + } +#endif +} + +extern inline void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap) { + heap_t *prev_heap = get_thread_heap_raw(); + if (prev_heap != heap) { + set_thread_heap(heap); + if (prev_heap) + rpmalloc_heap_release(prev_heap); + } +} + +extern inline rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr) { + // Grab the span, and then the heap from the span + span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask); + if (span) { + return span->heap; + } + return 0; +} + +#endif + +#if ENABLE_PRELOAD || ENABLE_OVERRIDE + +#include "malloc.c" + +#endif + +void rpmalloc_linker_reference(void) { (void)sizeof(_rpmalloc_initialized); } diff --git a/llvm/lib/Support/rpmalloc/rpmalloc.h b/llvm/lib/Support/rpmalloc/rpmalloc.h index 5b7fe1ff4286..3911c53b779b 100644 --- a/llvm/lib/Support/rpmalloc/rpmalloc.h +++ 
b/llvm/lib/Support/rpmalloc/rpmalloc.h @@ -1,428 +1,428 @@ -//===---------------------- rpmalloc.h ------------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(__clang__) || defined(__GNUC__) -#define RPMALLOC_EXPORT __attribute__((visibility("default"))) -#define RPMALLOC_ALLOCATOR -#if (defined(__clang_major__) && (__clang_major__ < 4)) || \ - (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD) -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#else -#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) \ - __attribute__((alloc_size(count, size))) -#endif -#define RPMALLOC_CDECL -#elif defined(_MSC_VER) -#define RPMALLOC_EXPORT -#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict) -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#define RPMALLOC_CDECL __cdecl -#else -#define RPMALLOC_EXPORT -#define RPMALLOC_ALLOCATOR -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#define RPMALLOC_CDECL -#endif - -//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce -// a very small overhead due to some size calculations not being compile time -// constants -#ifndef RPMALLOC_CONFIGURABLE -#define RPMALLOC_CONFIGURABLE 0 -#endif - -//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* -//! functions). -// Will introduce a very small overhead to track fully allocated spans in heaps -#ifndef RPMALLOC_FIRST_CLASS_HEAPS -#define RPMALLOC_FIRST_CLASS_HEAPS 0 -#endif - -//! Flag to rpaligned_realloc to not preserve content in reallocation -#define RPMALLOC_NO_PRESERVE 1 -//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be -//! done in-place, -// in which case the original pointer is still valid (just like a call to -// realloc which failes to allocate a new block). -#define RPMALLOC_GROW_OR_FAIL 2 - -typedef struct rpmalloc_global_statistics_t { - //! Current amount of virtual memory mapped, all of which might not have been - //! committed (only if ENABLE_STATISTICS=1) - size_t mapped; - //! Peak amount of virtual memory mapped, all of which might not have been - //! committed (only if ENABLE_STATISTICS=1) - size_t mapped_peak; - //! Current amount of memory in global caches for small and medium sizes - //! (<32KiB) - size_t cached; - //! Current amount of memory allocated in huge allocations, i.e larger than - //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) - size_t huge_alloc; - //! Peak amount of memory allocated in huge allocations, i.e larger than - //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) - size_t huge_alloc_peak; - //! 
Total amount of memory mapped since initialization (only if - //! ENABLE_STATISTICS=1) - size_t mapped_total; - //! Total amount of memory unmapped since initialization (only if - //! ENABLE_STATISTICS=1) - size_t unmapped_total; -} rpmalloc_global_statistics_t; - -typedef struct rpmalloc_thread_statistics_t { - //! Current number of bytes available in thread size class caches for small - //! and medium sizes (<32KiB) - size_t sizecache; - //! Current number of bytes available in thread span caches for small and - //! medium sizes (<32KiB) - size_t spancache; - //! Total number of bytes transitioned from thread cache to global cache (only - //! if ENABLE_STATISTICS=1) - size_t thread_to_global; - //! Total number of bytes transitioned from global cache to thread cache (only - //! if ENABLE_STATISTICS=1) - size_t global_to_thread; - //! Per span count statistics (only if ENABLE_STATISTICS=1) - struct { - //! Currently used number of spans - size_t current; - //! High water mark of spans used - size_t peak; - //! Number of spans transitioned to global cache - size_t to_global; - //! Number of spans transitioned from global cache - size_t from_global; - //! Number of spans transitioned to thread cache - size_t to_cache; - //! Number of spans transitioned from thread cache - size_t from_cache; - //! Number of spans transitioned to reserved state - size_t to_reserved; - //! Number of spans transitioned from reserved state - size_t from_reserved; - //! Number of raw memory map calls (not hitting the reserve spans but - //! resulting in actual OS mmap calls) - size_t map_calls; - } span_use[64]; - //! Per size class statistics (only if ENABLE_STATISTICS=1) - struct { - //! Current number of allocations - size_t alloc_current; - //! Peak number of allocations - size_t alloc_peak; - //! Total number of allocations - size_t alloc_total; - //! Total number of frees - size_t free_total; - //! Number of spans transitioned to cache - size_t spans_to_cache; - //! Number of spans transitioned from cache - size_t spans_from_cache; - //! Number of spans transitioned from reserved state - size_t spans_from_reserved; - //! Number of raw memory map calls (not hitting the reserve spans but - //! resulting in actual OS mmap calls) - size_t map_calls; - } size_use[128]; -} rpmalloc_thread_statistics_t; - -typedef struct rpmalloc_config_t { - //! Map memory pages for the given number of bytes. The returned address MUST - //! be - // aligned to the rpmalloc span size, which will always be a power of two. - // Optionally the function can store an alignment offset in the offset - // variable in case it performs alignment and the returned pointer is offset - // from the actual start of the memory region due to this alignment. The - // alignment offset will be passed to the memory unmap function. The - // alignment offset MUST NOT be larger than 65535 (storable in an uint16_t), - // if it is you must use natural alignment to shift it into 16 bits. If you - // set a memory_map function, you must also set a memory_unmap function or - // else the default implementation will be used for both. This function must - // be thread safe, it can be called by multiple threads simultaneously. - void *(*memory_map)(size_t size, size_t *offset); - //! Unmap the memory pages starting at address and spanning the given number - //! of bytes. - // If release is set to non-zero, the unmap is for an entire span range as - // returned by a previous call to memory_map and that the entire range should - // be released. 
The release argument holds the size of the entire span range. - // If release is set to 0, the unmap is a partial decommit of a subset of the - // mapped memory range. If you set a memory_unmap function, you must also set - // a memory_map function or else the default implementation will be used for - // both. This function must be thread safe, it can be called by multiple - // threads simultaneously. - void (*memory_unmap)(void *address, size_t size, size_t offset, - size_t release); - //! Called when an assert fails, if asserts are enabled. Will use the standard - //! assert() - // if this is not set. - void (*error_callback)(const char *message); - //! Called when a call to map memory pages fails (out of memory). If this - //! callback is - // not set or returns zero the library will return a null pointer in the - // allocation call. If this callback returns non-zero the map call will be - // retried. The argument passed is the number of bytes that was requested in - // the map call. Only used if the default system memory map function is used - // (memory_map callback is not set). - int (*map_fail_callback)(size_t size); - //! Size of memory pages. The page size MUST be a power of two. All memory - //! mapping - // requests to memory_map will be made with size set to a multiple of the - // page size. Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system - // page size is used. - size_t page_size; - //! Size of a span of memory blocks. MUST be a power of two, and in - //! [4096,262144] - // range (unless 0 - set to 0 to use the default span size). Used if - // RPMALLOC_CONFIGURABLE is defined to 1. - size_t span_size; - //! Number of spans to map at each request to map new virtual memory blocks. - //! This can - // be used to minimize the system call overhead at the cost of virtual memory - // address space. The extra mapped pages will not be written until actually - // used, so physical committed memory should not be affected in the default - // implementation. Will be aligned to a multiple of spans that match memory - // page size in case of huge pages. - size_t span_map_count; - //! Enable use of large/huge pages. If this flag is set to non-zero and page - //! size is - // zero, the allocator will try to enable huge pages and auto detect the - // configuration. If this is set to non-zero and page_size is also non-zero, - // the allocator will assume huge pages have been configured and enabled - // prior to initializing the allocator. For Windows, see - // https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support - // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt - int enable_huge_pages; - //! Respectively allocated pages and huge allocated pages names for systems - // supporting it to be able to distinguish among anonymous regions. - const char *page_name; - const char *huge_page_name; -} rpmalloc_config_t; - -//! Initialize allocator with default configuration -RPMALLOC_EXPORT int rpmalloc_initialize(void); - -//! Initialize allocator with given configuration -RPMALLOC_EXPORT int rpmalloc_initialize_config(const rpmalloc_config_t *config); - -//! Get allocator configuration -RPMALLOC_EXPORT const rpmalloc_config_t *rpmalloc_config(void); - -//! Finalize allocator -RPMALLOC_EXPORT void rpmalloc_finalize(void); - -//! Initialize allocator for calling thread -RPMALLOC_EXPORT void rpmalloc_thread_initialize(void); - -//! Finalize allocator for calling thread -RPMALLOC_EXPORT void rpmalloc_thread_finalize(int release_caches); - -//! 
Perform deferred deallocations pending for the calling thread heap -RPMALLOC_EXPORT void rpmalloc_thread_collect(void); - -//! Query if allocator is initialized for calling thread -RPMALLOC_EXPORT int rpmalloc_is_thread_initialized(void); - -//! Get per-thread statistics -RPMALLOC_EXPORT void -rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats); - -//! Get global statistics -RPMALLOC_EXPORT void -rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats); - -//! Dump all statistics in human readable format to file (should be a FILE*) -RPMALLOC_EXPORT void rpmalloc_dump_statistics(void *file); - -//! Allocate a memory block of at least the given size -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); - -//! Free the given memory block -RPMALLOC_EXPORT void rpfree(void *ptr); - -//! Allocate a memory block of at least the given size and zero initialize it -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); - -//! Reallocate the given block to at least the given size -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rprealloc(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Reallocate the given block to at least the given size and alignment, -// with optional control flags (see RPMALLOC_NO_PRESERVE). -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_realloc(void *ptr, size_t alignment, size_t size, size_t oldsize, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size and alignment, and zero -//! initialize it. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_calloc(size_t alignment, size_t num, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. 
A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT int rpposix_memalign(void **memptr, size_t alignment, - size_t size); - -//! Query the usable size of the given memory block (from given pointer to the -//! end of block) -RPMALLOC_EXPORT size_t rpmalloc_usable_size(void *ptr); - -//! Dummy empty function for forcing linker symbol inclusion -RPMALLOC_EXPORT void rpmalloc_linker_reference(void); - -#if RPMALLOC_FIRST_CLASS_HEAPS - -//! Heap type -typedef struct heap_t rpmalloc_heap_t; - -//! Acquire a new heap. Will reuse existing released heaps or allocate memory -//! for a new heap -// if none available. Heap API is implemented with the strict assumption that -// only one single thread will call heap functions for a given heap at any -// given time, no functions are thread safe. -RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_heap_acquire(void); - -//! Release a heap (does NOT free the memory allocated by the heap, use -//! rpmalloc_heap_free_all before destroying the heap). -// Releasing a heap will enable it to be reused by other threads. Safe to pass -// a null pointer. -RPMALLOC_EXPORT void rpmalloc_heap_release(rpmalloc_heap_t *heap); - -//! Allocate a memory block of at least the given size using the given heap. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size using the given heap. The -//! returned -// block will have the requested alignment. Alignment must be a power of two -// and a multiple of sizeof(void*), and should ideally be less than memory page -// size. A caveat of rpmalloc internals is that this must also be strictly less -// than the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Allocate a memory block of at least the given size using the given heap and -//! zero initialize it. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Allocate a memory block of at least the given size using the given heap and -//! zero initialize it. The returned -// block will have the requested alignment. Alignment must either be zero, or a -// power of two and a multiple of sizeof(void*), and should ideally be less -// than memory page size. A caveat of rpmalloc internals is that this must also -// be strictly less than the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, - size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Reallocate the given block to at least the given size. The memory block MUST -//! be allocated -// by the same heap given to this function. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Reallocate the given block to at least the given size. The memory block MUST -//! be allocated -// by the same heap given to this function. The returned block will have the -// requested alignment. 
Alignment must be either zero, or a power of two and a
-// multiple of sizeof(void*), and should ideally be less than memory page size.
-// A caveat of rpmalloc internals is that this must also be strictly less than
-// the span size (default 64KiB).
-RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void *rpmalloc_heap_aligned_realloc(
-    rpmalloc_heap_t *heap, void *ptr, size_t alignment, size_t size,
-    unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4);
-
-//! Free the given memory block from the given heap. The memory block MUST be
-//! allocated
-// by the same heap given to this function.
-RPMALLOC_EXPORT void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr);
-
-//! Free all memory allocated by the heap
-RPMALLOC_EXPORT void rpmalloc_heap_free_all(rpmalloc_heap_t *heap);
-
-//! Set the given heap as the current heap for the calling thread. A heap MUST
-//! only be current heap
-// for a single thread, a heap can never be shared between multiple threads.
-// The previous current heap for the calling thread is released to be reused by
-// other threads.
-RPMALLOC_EXPORT void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap);
-
-//! Returns which heap the given pointer is allocated on
-RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
+//===---------------------- rpmalloc.h ------------------*- C -*-=============//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This library provides a cross-platform lock free thread caching malloc
+// implementation in C11.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+#define RPMALLOC_EXPORT __attribute__((visibility("default")))
+#define RPMALLOC_ALLOCATOR
+#if (defined(__clang_major__) && (__clang_major__ < 4)) ||                    \
+    (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD)
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#else
+#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size)))
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)                              \
+  __attribute__((alloc_size(count, size)))
+#endif
+#define RPMALLOC_CDECL
+#elif defined(_MSC_VER)
+#define RPMALLOC_EXPORT
+#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict)
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#define RPMALLOC_CDECL __cdecl
+#else
+#define RPMALLOC_EXPORT
+#define RPMALLOC_ALLOCATOR
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#define RPMALLOC_CDECL
+#endif
+
+//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce
+// a very small overhead due to some size calculations not being compile time
+// constants
+#ifndef RPMALLOC_CONFIGURABLE
+#define RPMALLOC_CONFIGURABLE 0
+#endif
+
+//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_*
+//! functions).
+// Will introduce a very small overhead to track fully allocated spans in heaps
+#ifndef RPMALLOC_FIRST_CLASS_HEAPS
+#define RPMALLOC_FIRST_CLASS_HEAPS 0
+#endif
+
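+// A minimal usage sketch of the first-class heap API enabled by the define
+// above (editorial illustration only; the rpmalloc_heap_* functions are
+// declared at the end of this header, and error handling is omitted):
+//
+//   rpmalloc_heap_t *heap = rpmalloc_heap_acquire();
+//   void *ptr = rpmalloc_heap_alloc(heap, 128);
+//   rpmalloc_heap_free(heap, ptr);   // or rpmalloc_heap_free_all(heap)
+//   rpmalloc_heap_release(heap);
+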
+//! Flag to rpaligned_realloc to not preserve content in reallocation
+#define RPMALLOC_NO_PRESERVE 1
+//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be
+//! done in-place,
+// in which case the original pointer is still valid (just like a call to
+// realloc which fails to allocate a new block).
+#define RPMALLOC_GROW_OR_FAIL 2
+
+typedef struct rpmalloc_global_statistics_t {
+  //! Current amount of virtual memory mapped, all of which might not have been
+  //! committed (only if ENABLE_STATISTICS=1)
+  size_t mapped;
+  //! Peak amount of virtual memory mapped, all of which might not have been
+  //! committed (only if ENABLE_STATISTICS=1)
+  size_t mapped_peak;
+  //! Current amount of memory in global caches for small and medium sizes
+  //! (<32KiB)
+  size_t cached;
+  //! Current amount of memory allocated in huge allocations, i.e. larger than
+  //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1)
+  size_t huge_alloc;
+  //! Peak amount of memory allocated in huge allocations, i.e. larger than
+  //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1)
+  size_t huge_alloc_peak;
+  //! Total amount of memory mapped since initialization (only if
+  //! ENABLE_STATISTICS=1)
+  size_t mapped_total;
+  //! Total amount of memory unmapped since initialization (only if
+  //! ENABLE_STATISTICS=1)
+  size_t unmapped_total;
+} rpmalloc_global_statistics_t;
+
+typedef struct rpmalloc_thread_statistics_t {
+  //! Current number of bytes available in thread size class caches for small
+  //! and medium sizes (<32KiB)
+  size_t sizecache;
+  //! Current number of bytes available in thread span caches for small and
+  //! medium sizes (<32KiB)
+  size_t spancache;
+  //! Total number of bytes transitioned from thread cache to global cache (only
+  //! if ENABLE_STATISTICS=1)
+  size_t thread_to_global;
+  //! Total number of bytes transitioned from global cache to thread cache (only
+  //! if ENABLE_STATISTICS=1)
+  size_t global_to_thread;
+  //! Per span count statistics (only if ENABLE_STATISTICS=1)
+  struct {
+    //! Currently used number of spans
+    size_t current;
+    //! High water mark of spans used
+    size_t peak;
+    //! Number of spans transitioned to global cache
+    size_t to_global;
+    //! Number of spans transitioned from global cache
+    size_t from_global;
+    //! Number of spans transitioned to thread cache
+    size_t to_cache;
+    //! Number of spans transitioned from thread cache
+    size_t from_cache;
+    //! Number of spans transitioned to reserved state
+    size_t to_reserved;
+    //! Number of spans transitioned from reserved state
+    size_t from_reserved;
+    //! Number of raw memory map calls (not hitting the reserve spans but
+    //! resulting in actual OS mmap calls)
+    size_t map_calls;
+  } span_use[64];
+  //! Per size class statistics (only if ENABLE_STATISTICS=1)
+  struct {
+    //! Current number of allocations
+    size_t alloc_current;
+    //! Peak number of allocations
+    size_t alloc_peak;
+    //! Total number of allocations
+    size_t alloc_total;
+    //! Total number of frees
+    size_t free_total;
+    //! Number of spans transitioned to cache
+    size_t spans_to_cache;
+    //! Number of spans transitioned from cache
+    size_t spans_from_cache;
+    //! Number of spans transitioned from reserved state
+    size_t spans_from_reserved;
+    //! Number of raw memory map calls (not hitting the reserve spans but
+    //! resulting in actual OS mmap calls)
+    size_t map_calls;
+  } size_use[128];
+} rpmalloc_thread_statistics_t;
+
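+// A short editorial sketch of how the two statistics structs above are
+// typically consumed (most counters are only populated in builds with
+// ENABLE_STATISTICS=1; the query functions are declared further below):
+//
+//   rpmalloc_thread_statistics_t ts;
+//   rpmalloc_thread_statistics(&ts);
+//   printf("thread caches: %zu size-class bytes, %zu span bytes\n",
+//          ts.sizecache, ts.spancache);
+//
+//   rpmalloc_global_statistics_t gs;
+//   rpmalloc_global_statistics(&gs);
+//   printf("global span cache: %zu bytes\n", gs.cached);
+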
resulting in actual OS mmap calls) + size_t map_calls; + } size_use[128]; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST + //! be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset + // variable in case it performs alignment and the returned pointer is offset + // from the actual start of the memory region due to this alignment. The + // alignment offset will be passed to the memory unmap function. The + // alignment offset MUST NOT be larger than 65535 (storable in an uint16_t), + // if it is you must use natural alignment to shift it into 16 bits. If you + // set a memory_map function, you must also set a memory_unmap function or + // else the default implementation will be used for both. This function must + // be thread safe, it can be called by multiple threads simultaneously. + void *(*memory_map)(size_t size, size_t *offset); + //! Unmap the memory pages starting at address and spanning the given number + //! of bytes. + // If release is set to non-zero, the unmap is for an entire span range as + // returned by a previous call to memory_map and that the entire range should + // be released. The release argument holds the size of the entire span range. + // If release is set to 0, the unmap is a partial decommit of a subset of the + // mapped memory range. If you set a memory_unmap function, you must also set + // a memory_map function or else the default implementation will be used for + // both. This function must be thread safe, it can be called by multiple + // threads simultaneously. + void (*memory_unmap)(void *address, size_t size, size_t offset, + size_t release); + //! Called when an assert fails, if asserts are enabled. Will use the standard + //! assert() + // if this is not set. + void (*error_callback)(const char *message); + //! Called when a call to map memory pages fails (out of memory). If this + //! callback is + // not set or returns zero the library will return a null pointer in the + // allocation call. If this callback returns non-zero the map call will be + // retried. The argument passed is the number of bytes that was requested in + // the map call. Only used if the default system memory map function is used + // (memory_map callback is not set). + int (*map_fail_callback)(size_t size); + //! Size of memory pages. The page size MUST be a power of two. All memory + //! mapping + // requests to memory_map will be made with size set to a multiple of the + // page size. Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system + // page size is used. + size_t page_size; + //! Size of a span of memory blocks. MUST be a power of two, and in + //! [4096,262144] + // range (unless 0 - set to 0 to use the default span size). Used if + // RPMALLOC_CONFIGURABLE is defined to 1. + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. + //! This can + // be used to minimize the system call overhead at the cost of virtual memory + // address space. The extra mapped pages will not be written until actually + // used, so physical committed memory should not be affected in the default + // implementation. Will be aligned to a multiple of spans that match memory + // page size in case of huge pages. + size_t span_map_count; + //! Enable use of large/huge pages. If this flag is set to non-zero and page + //! 
size is + // zero, the allocator will try to enable huge pages and auto detect the + // configuration. If this is set to non-zero and page_size is also non-zero, + // the allocator will assume huge pages have been configured and enabled + // prior to initializing the allocator. For Windows, see + // https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support + // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + int enable_huge_pages; + //! Respectively allocated pages and huge allocated pages names for systems + // supporting it to be able to distinguish among anonymous regions. + const char *page_name; + const char *huge_page_name; +} rpmalloc_config_t; + +//! Initialize allocator with default configuration +RPMALLOC_EXPORT int rpmalloc_initialize(void); + +//! Initialize allocator with given configuration +RPMALLOC_EXPORT int rpmalloc_initialize_config(const rpmalloc_config_t *config); + +//! Get allocator configuration +RPMALLOC_EXPORT const rpmalloc_config_t *rpmalloc_config(void); + +//! Finalize allocator +RPMALLOC_EXPORT void rpmalloc_finalize(void); + +//! Initialize allocator for calling thread +RPMALLOC_EXPORT void rpmalloc_thread_initialize(void); + +//! Finalize allocator for calling thread +RPMALLOC_EXPORT void rpmalloc_thread_finalize(int release_caches); + +//! Perform deferred deallocations pending for the calling thread heap +RPMALLOC_EXPORT void rpmalloc_thread_collect(void); + +//! Query if allocator is initialized for calling thread +RPMALLOC_EXPORT int rpmalloc_is_thread_initialized(void); + +//! Get per-thread statistics +RPMALLOC_EXPORT void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats); + +//! Get global statistics +RPMALLOC_EXPORT void +rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats); + +//! Dump all statistics in human readable format to file (should be a FILE*) +RPMALLOC_EXPORT void rpmalloc_dump_statistics(void *file); + +//! Allocate a memory block of at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); + +//! Free the given memory block +RPMALLOC_EXPORT void rpfree(void *ptr); + +//! Allocate a memory block of at least the given size and zero initialize it +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); + +//! Reallocate the given block to at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rprealloc(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Reallocate the given block to at least the given size and alignment, +// with optional control flags (see RPMALLOC_NO_PRESERVE). +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_realloc(void *ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. 
A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment, and zero +//! initialize it. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_calloc(size_t alignment, size_t num, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT int rpposix_memalign(void **memptr, size_t alignment, + size_t size); + +//! Query the usable size of the given memory block (from given pointer to the +//! end of block) +RPMALLOC_EXPORT size_t rpmalloc_usable_size(void *ptr); + +//! Dummy empty function for forcing linker symbol inclusion +RPMALLOC_EXPORT void rpmalloc_linker_reference(void); + +#if RPMALLOC_FIRST_CLASS_HEAPS + +//! Heap type +typedef struct heap_t rpmalloc_heap_t; + +//! Acquire a new heap. Will reuse existing released heaps or allocate memory +//! for a new heap +// if none available. Heap API is implemented with the strict assumption that +// only one single thread will call heap functions for a given heap at any +// given time, no functions are thread safe. +RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_heap_acquire(void); + +//! Release a heap (does NOT free the memory allocated by the heap, use +//! rpmalloc_heap_free_all before destroying the heap). +// Releasing a heap will enable it to be reused by other threads. Safe to pass +// a null pointer. +RPMALLOC_EXPORT void rpmalloc_heap_release(rpmalloc_heap_t *heap); + +//! Allocate a memory block of at least the given size using the given heap. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size using the given heap. The +//! returned +// block will have the requested alignment. Alignment must be a power of two +// and a multiple of sizeof(void*), and should ideally be less than memory page +// size. A caveat of rpmalloc internals is that this must also be strictly less +// than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size using the given heap and +//! 
zero initialize it. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size using the given heap and +//! zero initialize it. The returned +// block will have the requested alignment. Alignment must either be zero, or a +// power of two and a multiple of sizeof(void*), and should ideally be less +// than memory page size. A caveat of rpmalloc internals is that this must also +// be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, + size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Reallocate the given block to at least the given size. The memory block MUST +//! be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Reallocate the given block to at least the given size. The memory block MUST +//! be allocated +// by the same heap given to this function. The returned block will have the +// requested alignment. Alignment must be either zero, or a power of two and a +// multiple of sizeof(void*), and should ideally be less than memory page size. +// A caveat of rpmalloc internals is that this must also be strictly less than +// the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void *rpmalloc_heap_aligned_realloc( + rpmalloc_heap_t *heap, void *ptr, size_t alignment, size_t size, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4); + +//! Free the given memory block from the given heap. The memory block MUST be +//! allocated +// by the same heap given to this function. +RPMALLOC_EXPORT void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr); + +//! Free all memory allocated by the heap +RPMALLOC_EXPORT void rpmalloc_heap_free_all(rpmalloc_heap_t *heap); + +//! Set the given heap as the current heap for the calling thread. A heap MUST +//! only be current heap +// for a single thread, a heap can never be shared between multiple threads. +// The previous current heap for the calling thread is released to be reused by +// other threads. +RPMALLOC_EXPORT void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap); + +//! Returns which heap the given pointer is allocated on +RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr); + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/llvm/lib/Support/rpmalloc/rpnew.h b/llvm/lib/Support/rpmalloc/rpnew.h index a18f0799d56d..d8303c6f9565 100644 --- a/llvm/lib/Support/rpmalloc/rpnew.h +++ b/llvm/lib/Support/rpmalloc/rpnew.h @@ -1,113 +1,113 @@ -//===-------------------------- rpnew.h -----------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. 
-//
-//===----------------------------------------------------------------------===//
-
-#ifdef __cplusplus
-
-#include <new>
-#include <rpmalloc.h>
-
-#ifndef __CRTDECL
-#define __CRTDECL
-#endif
-
-extern void __CRTDECL operator delete(void *p) noexcept { rpfree(p); }
-
-extern void __CRTDECL operator delete[](void *p) noexcept { rpfree(p); }
-
-extern void *__CRTDECL operator new(std::size_t size) noexcept(false) {
-  return rpmalloc(size);
-}
-
-extern void *__CRTDECL operator new[](std::size_t size) noexcept(false) {
-  return rpmalloc(size);
-}
-
-extern void *__CRTDECL operator new(std::size_t size,
-                                    const std::nothrow_t &tag) noexcept {
-  (void)sizeof(tag);
-  return rpmalloc(size);
-}
-
-extern void *__CRTDECL operator new[](std::size_t size,
-                                      const std::nothrow_t &tag) noexcept {
-  (void)sizeof(tag);
-  return rpmalloc(size);
-}
-
-#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
-
-extern void __CRTDECL operator delete(void *p, std::size_t size) noexcept {
-  (void)sizeof(size);
-  rpfree(p);
-}
-
-extern void __CRTDECL operator delete[](void *p, std::size_t size) noexcept {
-  (void)sizeof(size);
-  rpfree(p);
-}
-
-#endif
-
-#if (__cplusplus > 201402L || defined(__cpp_aligned_new))
-
-extern void __CRTDECL operator delete(void *p,
-                                      std::align_val_t align) noexcept {
-  (void)sizeof(align);
-  rpfree(p);
-}
-
-extern void __CRTDECL operator delete[](void *p,
-                                        std::align_val_t align) noexcept {
-  (void)sizeof(align);
-  rpfree(p);
-}
-
-extern void __CRTDECL operator delete(void *p, std::size_t size,
-                                      std::align_val_t align) noexcept {
-  (void)sizeof(size);
-  (void)sizeof(align);
-  rpfree(p);
-}
-
-extern void __CRTDECL operator delete[](void *p, std::size_t size,
-                                        std::align_val_t align) noexcept {
-  (void)sizeof(size);
-  (void)sizeof(align);
-  rpfree(p);
-}
-
-extern void *__CRTDECL operator new(std::size_t size,
-                                    std::align_val_t align) noexcept(false) {
-  return rpaligned_alloc(static_cast<size_t>(align), size);
-}
-
-extern void *__CRTDECL operator new[](std::size_t size,
-                                      std::align_val_t align) noexcept(false) {
-  return rpaligned_alloc(static_cast<size_t>(align), size);
-}
-
-extern void *__CRTDECL operator new(std::size_t size, std::align_val_t align,
-                                    const std::nothrow_t &tag) noexcept {
-  (void)sizeof(tag);
-  return rpaligned_alloc(static_cast<size_t>(align), size);
-}
-
-extern void *__CRTDECL operator new[](std::size_t size, std::align_val_t align,
-                                      const std::nothrow_t &tag) noexcept {
-  (void)sizeof(tag);
-  return rpaligned_alloc(static_cast<size_t>(align), size);
-}
-
-#endif
-
-#endif
+//===-------------------------- rpnew.h -----------------*- C -*-=============//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This library provides a cross-platform lock free thread caching malloc
+// implementation in C11. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __cplusplus
+
+#include <new>
+#include <rpmalloc.h>
+
+#ifndef __CRTDECL
+#define __CRTDECL
+#endif
+
+extern void __CRTDECL operator delete(void *p) noexcept { rpfree(p); }
+
+extern void __CRTDECL operator delete[](void *p) noexcept { rpfree(p); }
+
+extern void *__CRTDECL operator new(std::size_t size) noexcept(false) {
+  return rpmalloc(size);
+}
+
+extern void *__CRTDECL operator new[](std::size_t size) noexcept(false) {
+  return rpmalloc(size);
+}
+
+extern void *__CRTDECL operator new(std::size_t size,
+                                    const std::nothrow_t &tag) noexcept {
+  (void)sizeof(tag);
+  return rpmalloc(size);
+}
+
+extern void *__CRTDECL operator new[](std::size_t size,
+                                      const std::nothrow_t &tag) noexcept {
+  (void)sizeof(tag);
+  return rpmalloc(size);
+}
+
+#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
+
+extern void __CRTDECL operator delete(void *p, std::size_t size) noexcept {
+  (void)sizeof(size);
+  rpfree(p);
+}
+
+extern void __CRTDECL operator delete[](void *p, std::size_t size) noexcept {
+  (void)sizeof(size);
+  rpfree(p);
+}
+
+#endif
+
+#if (__cplusplus > 201402L || defined(__cpp_aligned_new))
+
+extern void __CRTDECL operator delete(void *p,
+                                      std::align_val_t align) noexcept {
+  (void)sizeof(align);
+  rpfree(p);
+}
+
+extern void __CRTDECL operator delete[](void *p,
+                                        std::align_val_t align) noexcept {
+  (void)sizeof(align);
+  rpfree(p);
+}
+
+extern void __CRTDECL operator delete(void *p, std::size_t size,
+                                      std::align_val_t align) noexcept {
+  (void)sizeof(size);
+  (void)sizeof(align);
+  rpfree(p);
+}
+
+extern void __CRTDECL operator delete[](void *p, std::size_t size,
+                                        std::align_val_t align) noexcept {
+  (void)sizeof(size);
+  (void)sizeof(align);
+  rpfree(p);
+}
+
+extern void *__CRTDECL operator new(std::size_t size,
+                                    std::align_val_t align) noexcept(false) {
+  return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void *__CRTDECL operator new[](std::size_t size,
+                                      std::align_val_t align) noexcept(false) {
+  return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void *__CRTDECL operator new(std::size_t size, std::align_val_t align,
+                                    const std::nothrow_t &tag) noexcept {
+  (void)sizeof(tag);
+  return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void *__CRTDECL operator new[](std::size_t size, std::align_val_t align,
+                                      const std::nothrow_t &tag) noexcept {
+  (void)sizeof(tag);
+  return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+#endif
+
+#endif
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index 9844fd394aa4..8ea31401121b 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -1,38 +1,38 @@
-//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++
-//-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -//===----------------------------------------------------------------------===// - -#include "DirectXTargetTransformInfo.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IntrinsicsDirectX.h" - -using namespace llvm; - -bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { - switch (ID) { - case Intrinsic::dx_wave_readlane: - return ScalarOpdIdx == 1; - default: - return false; - } -} - -bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( - Intrinsic::ID ID) const { - switch (ID) { - case Intrinsic::dx_frac: - case Intrinsic::dx_rsqrt: - case Intrinsic::dx_wave_readlane: - return true; - default: - return false; - } -} +//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +//===----------------------------------------------------------------------===// + +#include "DirectXTargetTransformInfo.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsDirectX.h" + +using namespace llvm; + +bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, + unsigned ScalarOpdIdx) { + switch (ID) { + case Intrinsic::dx_wave_readlane: + return ScalarOpdIdx == 1; + default: + return false; + } +} + +bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( + Intrinsic::ID ID) const { + switch (ID) { + case Intrinsic::dx_frac: + case Intrinsic::dx_rsqrt: + case Intrinsic::dx_wave_readlane: + return true; + default: + return false; + } +} diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll index b2c650d11626..9d86f87f3ed5 100644 --- a/llvm/test/CodeGen/DirectX/atan2.ll +++ b/llvm/test/CodeGen/DirectX/atan2.ll @@ -1,87 +1,87 @@ -; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK -; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK - -; Make sure correct dxil expansions for atan2 are generated for float and half. 
-
-define noundef float @atan2_float(float noundef %y, float noundef %x) {
-entry:
-; CHECK: [[DIV:%.+]] = fdiv float %y, %x
-; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]])
-; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]])
-; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000
-; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000
-; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00
-; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00
-; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00
-; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00
-; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]]
-; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]]
-; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]]
-; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]]
-; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]]
-; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]]
-; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]]
-; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]]
-; CHECK: ret float [[SELECT_HPI]]
-  %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x)
-  ret float %elt.atan2
-}
-
-define noundef half @atan2_half(half noundef %y, half noundef %x) {
-entry:
-; CHECK: [[DIV:%.+]] = fdiv half %y, %x
-; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]])
-; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]])
-; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248
-; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248
-; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000
-; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000
-; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000
-; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000
-; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]]
-; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]]
-; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]]
-; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]]
-; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]]
-; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]]
-; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]]
-; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]]
-; CHECK: ret half [[SELECT_HPI]]
-  %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x)
-  ret half %elt.atan2
-}
-
-define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) {
-entry:
-; Just Expansion, no scalarization or lowering:
-; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x
-; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]])
-; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000>
-; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000>
-; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer
-; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer
-; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer
-; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer 
-; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]]
-; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]]
-; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]]
-; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]]
-; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]]
-; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> <float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000>, <4 x float> [[SELECT_SUB_PI]]
-; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]]
-; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> <float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000>, <4 x float> [[SELECT_NEGHPI]]
-; EXPCHECK: ret <4 x float> [[SELECT_HPI]]
-
-; Scalarization occurs after expansion, so atan scalarization is tested separately.
-; Expansion, scalarization and lowering:
-; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls.
-; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}})
-; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17,
-
-  %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x)
-  ret <4 x float> %elt.atan2
-}
-
-declare half @llvm.atan2.f16(half, half)
-declare float @llvm.atan2.f32(float, float)
-declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>)
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+
+; Make sure correct dxil expansions for atan2 are generated for float and half. 
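The quadrant logic these tests check is easier to read as scalar code than as an and/select chain. The following C model is an editor's illustration of what the expansion computes, not code from the patch; the function name is invented and the constants are the float values of the hex literals in the CHECK lines.

#include <math.h>

/* Model of the DXIL atan2 expansion: atan(y/x) plus quadrant fixups.
   The four mutually exclusive conditions mirror the and/select pairs
   in the CHECK lines below. */
static float atan2_expanded(float y, float x) {
  const float pi = 3.14159274f;      /* 0x400921FB60000000 */
  const float half_pi = 1.57079637f; /* 0x3FF921FB60000000 */
  float a = atanf(y / x); /* llvm.atan / dx.op.unary opcode 17 */
  float r = a;
  if (x < 0.0f && y >= 0.0f)
    r = a + pi; /* SELECT_ADD_PI */
  if (x < 0.0f && y < 0.0f)
    r = a - pi; /* SELECT_SUB_PI */
  if (x == 0.0f && y < 0.0f)
    r = -half_pi; /* SELECT_NEGHPI */
  if (x == 0.0f && y >= 0.0f)
    r = half_pi; /* SELECT_HPI */
  return r;
}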
+
+define noundef float @atan2_float(float noundef %y, float noundef %x) {
+entry:
+; CHECK: [[DIV:%.+]] = fdiv float %y, %x
+; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]])
+; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]])
+; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000
+; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000
+; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00
+; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00
+; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00
+; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00
+; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]]
+; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]]
+; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]]
+; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]]
+; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]]
+; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]]
+; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]]
+; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]]
+; CHECK: ret float [[SELECT_HPI]]
+  %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x)
+  ret float %elt.atan2
+}
+
+define noundef half @atan2_half(half noundef %y, half noundef %x) {
+entry:
+; CHECK: [[DIV:%.+]] = fdiv half %y, %x
+; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]])
+; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]])
+; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248
+; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248
+; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000
+; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000
+; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000
+; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000
+; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]]
+; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]]
+; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]]
+; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]]
+; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]]
+; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]]
+; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]]
+; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]]
+; CHECK: ret half [[SELECT_HPI]]
+  %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x)
+  ret half %elt.atan2
+}
+
+define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) {
+entry:
+; Just Expansion, no scalarization or lowering:
+; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x
+; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]])
+; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000>
+; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], <float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000, float 0x400921FB60000000>
+; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer
+; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer
+; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer
+; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer 
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]]
+; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]]
+; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]]
+; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> <float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000, float 0xBFF921FB60000000>, <4 x float> [[SELECT_SUB_PI]]
+; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]]
+; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> <float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0x3FF921FB60000000>, <4 x float> [[SELECT_NEGHPI]]
+; EXPCHECK: ret <4 x float> [[SELECT_HPI]]
+
+; Scalarization occurs after expansion, so atan scalarization is tested separately.
+; Expansion, scalarization and lowering:
+; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls.
+; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}})
+; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17,
+
+  %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x)
+  ret <4 x float> %elt.atan2
+}
+
+declare half @llvm.atan2.f16(half, half)
+declare float @llvm.atan2.f32(float, float)
+declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/atan2_error.ll b/llvm/test/CodeGen/DirectX/atan2_error.ll
index 9b66f9f1dd45..372934098b7c 100644
--- a/llvm/test/CodeGen/DirectX/atan2_error.ll
+++ b/llvm/test/CodeGen/DirectX/atan2_error.ll
@@ -1,11 +1,11 @@
-; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
-
-; DXIL operation atan does not support double overload type
-; CHECK: in function atan2_double
-; CHECK-SAME: Cannot create ATan operation: Invalid overload type
-
-define noundef double @atan2_double(double noundef %a, double noundef %b) #0 {
-entry:
-  %1 = call double @llvm.atan2.f64(double %a, double %b)
-  ret double %1
-}
+; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
+
+; DXIL operation atan does not support double overload type
+; CHECK: in function atan2_double
+; CHECK-SAME: Cannot create ATan operation: Invalid overload type
+
+define noundef double @atan2_double(double noundef %a, double noundef %b) #0 {
+entry:
+  %1 = call double @llvm.atan2.f64(double %a, double %b)
+  ret double %1
+}
diff --git a/llvm/test/CodeGen/DirectX/cross.ll b/llvm/test/CodeGen/DirectX/cross.ll
index 6153cf7cddc9..6ec3ec4d3594 100644
--- a/llvm/test/CodeGen/DirectX/cross.ll
+++ b/llvm/test/CodeGen/DirectX/cross.ll
@@ -1,56 +1,56 @@
-; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s
-
-; Make sure dxil operation function calls for cross are generated for half/float. 
- -declare <3 x half> @llvm.dx.cross.v3f16(<3 x half>, <3 x half>) -declare <3 x float> @llvm.dx.cross.v3f32(<3 x float>, <3 x float>) - -define noundef <3 x half> @test_cross_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { -entry: - ; CHECK: %x0 = extractelement <3 x half> %p0, i64 0 - ; CHECK: %x1 = extractelement <3 x half> %p0, i64 1 - ; CHECK: %x2 = extractelement <3 x half> %p0, i64 2 - ; CHECK: %y0 = extractelement <3 x half> %p1, i64 0 - ; CHECK: %y1 = extractelement <3 x half> %p1, i64 1 - ; CHECK: %y2 = extractelement <3 x half> %p1, i64 2 - ; CHECK: %0 = fmul half %x1, %y2 - ; CHECK: %1 = fmul half %x2, %y1 - ; CHECK: %hlsl.cross1 = fsub half %0, %1 - ; CHECK: %2 = fmul half %x2, %y0 - ; CHECK: %3 = fmul half %x0, %y2 - ; CHECK: %hlsl.cross2 = fsub half %2, %3 - ; CHECK: %4 = fmul half %x0, %y1 - ; CHECK: %5 = fmul half %x1, %y0 - ; CHECK: %hlsl.cross3 = fsub half %4, %5 - ; CHECK: %6 = insertelement <3 x half> undef, half %hlsl.cross1, i64 0 - ; CHECK: %7 = insertelement <3 x half> %6, half %hlsl.cross2, i64 1 - ; CHECK: %8 = insertelement <3 x half> %7, half %hlsl.cross3, i64 2 - ; CHECK: ret <3 x half> %8 - %hlsl.cross = call <3 x half> @llvm.dx.cross.v3f16(<3 x half> %p0, <3 x half> %p1) - ret <3 x half> %hlsl.cross -} - -define noundef <3 x float> @test_cross_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { -entry: - ; CHECK: %x0 = extractelement <3 x float> %p0, i64 0 - ; CHECK: %x1 = extractelement <3 x float> %p0, i64 1 - ; CHECK: %x2 = extractelement <3 x float> %p0, i64 2 - ; CHECK: %y0 = extractelement <3 x float> %p1, i64 0 - ; CHECK: %y1 = extractelement <3 x float> %p1, i64 1 - ; CHECK: %y2 = extractelement <3 x float> %p1, i64 2 - ; CHECK: %0 = fmul float %x1, %y2 - ; CHECK: %1 = fmul float %x2, %y1 - ; CHECK: %hlsl.cross1 = fsub float %0, %1 - ; CHECK: %2 = fmul float %x2, %y0 - ; CHECK: %3 = fmul float %x0, %y2 - ; CHECK: %hlsl.cross2 = fsub float %2, %3 - ; CHECK: %4 = fmul float %x0, %y1 - ; CHECK: %5 = fmul float %x1, %y0 - ; CHECK: %hlsl.cross3 = fsub float %4, %5 - ; CHECK: %6 = insertelement <3 x float> undef, float %hlsl.cross1, i64 0 - ; CHECK: %7 = insertelement <3 x float> %6, float %hlsl.cross2, i64 1 - ; CHECK: %8 = insertelement <3 x float> %7, float %hlsl.cross3, i64 2 - ; CHECK: ret <3 x float> %8 - %hlsl.cross = call <3 x float> @llvm.dx.cross.v3f32(<3 x float> %p0, <3 x float> %p1) - ret <3 x float> %hlsl.cross -} +; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s + +; Make sure dxil operation function calls for cross are generated for half/float. 
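Since DXIL has no cross-product opcode, the intrinsic is expanded into six multiplies and three subtracts on extracted lanes, i.e. the usual determinant form. For reference, the same computation in C (an editor's sketch for illustration only; the float3 helper type and function name are invented):

/* cross(a, b) as expanded below: each output lane pairs two fmuls
   with one fsub (%hlsl.cross1..3 in the CHECK lines). */
typedef struct {
  float x, y, z;
} float3;

static float3 cross3(float3 a, float3 b) {
  float3 r;
  r.x = a.y * b.z - a.z * b.y; /* %hlsl.cross1 */
  r.y = a.z * b.x - a.x * b.z; /* %hlsl.cross2 */
  r.z = a.x * b.y - a.y * b.x; /* %hlsl.cross3 */
  return r;
}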
+ +declare <3 x half> @llvm.dx.cross.v3f16(<3 x half>, <3 x half>) +declare <3 x float> @llvm.dx.cross.v3f32(<3 x float>, <3 x float>) + +define noundef <3 x half> @test_cross_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { +entry: + ; CHECK: %x0 = extractelement <3 x half> %p0, i64 0 + ; CHECK: %x1 = extractelement <3 x half> %p0, i64 1 + ; CHECK: %x2 = extractelement <3 x half> %p0, i64 2 + ; CHECK: %y0 = extractelement <3 x half> %p1, i64 0 + ; CHECK: %y1 = extractelement <3 x half> %p1, i64 1 + ; CHECK: %y2 = extractelement <3 x half> %p1, i64 2 + ; CHECK: %0 = fmul half %x1, %y2 + ; CHECK: %1 = fmul half %x2, %y1 + ; CHECK: %hlsl.cross1 = fsub half %0, %1 + ; CHECK: %2 = fmul half %x2, %y0 + ; CHECK: %3 = fmul half %x0, %y2 + ; CHECK: %hlsl.cross2 = fsub half %2, %3 + ; CHECK: %4 = fmul half %x0, %y1 + ; CHECK: %5 = fmul half %x1, %y0 + ; CHECK: %hlsl.cross3 = fsub half %4, %5 + ; CHECK: %6 = insertelement <3 x half> undef, half %hlsl.cross1, i64 0 + ; CHECK: %7 = insertelement <3 x half> %6, half %hlsl.cross2, i64 1 + ; CHECK: %8 = insertelement <3 x half> %7, half %hlsl.cross3, i64 2 + ; CHECK: ret <3 x half> %8 + %hlsl.cross = call <3 x half> @llvm.dx.cross.v3f16(<3 x half> %p0, <3 x half> %p1) + ret <3 x half> %hlsl.cross +} + +define noundef <3 x float> @test_cross_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { +entry: + ; CHECK: %x0 = extractelement <3 x float> %p0, i64 0 + ; CHECK: %x1 = extractelement <3 x float> %p0, i64 1 + ; CHECK: %x2 = extractelement <3 x float> %p0, i64 2 + ; CHECK: %y0 = extractelement <3 x float> %p1, i64 0 + ; CHECK: %y1 = extractelement <3 x float> %p1, i64 1 + ; CHECK: %y2 = extractelement <3 x float> %p1, i64 2 + ; CHECK: %0 = fmul float %x1, %y2 + ; CHECK: %1 = fmul float %x2, %y1 + ; CHECK: %hlsl.cross1 = fsub float %0, %1 + ; CHECK: %2 = fmul float %x2, %y0 + ; CHECK: %3 = fmul float %x0, %y2 + ; CHECK: %hlsl.cross2 = fsub float %2, %3 + ; CHECK: %4 = fmul float %x0, %y1 + ; CHECK: %5 = fmul float %x1, %y0 + ; CHECK: %hlsl.cross3 = fsub float %4, %5 + ; CHECK: %6 = insertelement <3 x float> undef, float %hlsl.cross1, i64 0 + ; CHECK: %7 = insertelement <3 x float> %6, float %hlsl.cross2, i64 1 + ; CHECK: %8 = insertelement <3 x float> %7, float %hlsl.cross3, i64 2 + ; CHECK: ret <3 x float> %8 + %hlsl.cross = call <3 x float> @llvm.dx.cross.v3f32(<3 x float> %p0, <3 x float> %p1) + ret <3 x float> %hlsl.cross +} diff --git a/llvm/test/CodeGen/DirectX/normalize.ll b/llvm/test/CodeGen/DirectX/normalize.ll index de106be12437..2aba9d5f74d7 100644 --- a/llvm/test/CodeGen/DirectX/normalize.ll +++ b/llvm/test/CodeGen/DirectX/normalize.ll @@ -1,112 +1,112 @@ -; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK -; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK - -; Make sure dxil operation function calls for normalize are generated for half/float. 
- -declare half @llvm.dx.normalize.f16(half) -declare <2 x half> @llvm.dx.normalize.v2f16(<2 x half>) -declare <3 x half> @llvm.dx.normalize.v3f16(<3 x half>) -declare <4 x half> @llvm.dx.normalize.v4f16(<4 x half>) - -declare float @llvm.dx.normalize.f32(float) -declare <2 x float> @llvm.dx.normalize.v2f32(<2 x float>) -declare <3 x float> @llvm.dx.normalize.v3f32(<3 x float>) -declare <4 x float> @llvm.dx.normalize.v4f32(<4 x float>) - -define noundef half @test_normalize_half(half noundef %p0) { -entry: - ; CHECK: fdiv half %p0, %p0 - %hlsl.normalize = call half @llvm.dx.normalize.f16(half %p0) - ret half %hlsl.normalize -} - -define noundef <2 x half> @test_normalize_half2(<2 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth2:%.*]] = call half @llvm.dx.dot2.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) - ; DOPCHECK: [[doth2:%.*]] = call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth2]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth2]]) - ; CHECK: [[splatinserth2:%.*]] = insertelement <2 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] = shufflevector <2 x half> [[splatinserth2]], <2 x half> poison, <2 x i32> zeroinitializer - ; CHECK: fmul <2 x half> %p0, [[splat]] - - %hlsl.normalize = call <2 x half> @llvm.dx.normalize.v2f16(<2 x half> %p0) - ret <2 x half> %hlsl.normalize -} - -define noundef <3 x half> @test_normalize_half3(<3 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth3:%.*]] = call half @llvm.dx.dot3.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) - ; DOPCHECK: [[doth3:%.*]] = call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth3]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth3]]) - ; CHECK: [[splatinserth3:%.*]] = insertelement <3 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <3 x half> [[splatinserth3]], <3 x half> poison, <3 x i32> zeroinitializer - ; CHECK: fmul <3 x half> %p0, %.splat - - %hlsl.normalize = call <3 x half> @llvm.dx.normalize.v3f16(<3 x half> %p0) - ret <3 x half> %hlsl.normalize -} - -define noundef <4 x half> @test_normalize_half4(<4 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth4:%.*]] = call half @llvm.dx.dot4.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) - ; DOPCHECK: [[doth4:%.*]] = call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth4]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth4]]) - ; CHECK: [[splatinserth4:%.*]] = insertelement <4 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <4 x half> [[splatinserth4]], <4 x half> poison, <4 x i32> zeroinitializer - ; CHECK: fmul <4 x half> %p0, %.splat - - %hlsl.normalize = call <4 x half> @llvm.dx.normalize.v4f16(<4 x half> %p0) - ret <4 x half> %hlsl.normalize -} - -define noundef float @test_normalize_float(float noundef %p0) { -entry: - ; CHECK: fdiv float %p0, %p0 - %hlsl.normalize = call float @llvm.dx.normalize.f32(float %p0) - ret float %hlsl.normalize -} - -define noundef <2 x float> @test_normalize_float2(<2 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf2:%.*]] = call float @llvm.dx.dot2.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) - ; DOPCHECK: [[dotf2:%.*]] = call float @dx.op.dot2.f32(i32 
54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf2]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf2]]) - ; CHECK: [[splatinsertf2:%.*]] = insertelement <2 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <2 x float> [[splatinsertf2]], <2 x float> poison, <2 x i32> zeroinitializer - ; CHECK: fmul <2 x float> %p0, %.splat - - %hlsl.normalize = call <2 x float> @llvm.dx.normalize.v2f32(<2 x float> %p0) - ret <2 x float> %hlsl.normalize -} - -define noundef <3 x float> @test_normalize_float3(<3 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf3:%.*]] = call float @llvm.dx.dot3.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) - ; DOPCHECK: [[dotf3:%.*]] = call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf3]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf3]]) - ; CHECK: [[splatinsertf3:%.*]] = insertelement <3 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <3 x float> [[splatinsertf3]], <3 x float> poison, <3 x i32> zeroinitializer - ; CHECK: fmul <3 x float> %p0, %.splat - - %hlsl.normalize = call <3 x float> @llvm.dx.normalize.v3f32(<3 x float> %p0) - ret <3 x float> %hlsl.normalize -} - -define noundef <4 x float> @test_normalize_float4(<4 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf4:%.*]] = call float @llvm.dx.dot4.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - ; DOPCHECK: [[dotf4:%.*]] = call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf4]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf4]]) - ; CHECK: [[splatinsertf4:%.*]] = insertelement <4 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <4 x float> [[splatinsertf4]], <4 x float> poison, <4 x i32> zeroinitializer - ; CHECK: fmul <4 x float> %p0, %.splat - - %hlsl.normalize = call <4 x float> @llvm.dx.normalize.v4f32(<4 x float> %p0) - ret <4 x float> %hlsl.normalize -} +; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK +; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK + +; Make sure dxil operation function calls for normalize are generated for half/float. 
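The pattern verified below is v * rsqrt(dot(v, v)): one dot-product op, one reciprocal square root, then a splatted multiply; the scalar overload simply folds to p0 / p0. A C sketch of the three-element case, as an illustration only (names are invented, and rsqrt is modeled as 1 / sqrtf since C has no direct equivalent):

#include <math.h>

static void normalize3(const float v[3], float out[3]) {
  float d = v[0] * v[0] + v[1] * v[1] + v[2] * v[2]; /* dx.op.dot3 */
  float inv_len = 1.0f / sqrtf(d);                   /* dx.op rsqrt */
  for (int i = 0; i < 3; ++i)
    out[i] = v[i] * inv_len; /* splatted fmul */
}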
+ +declare half @llvm.dx.normalize.f16(half) +declare <2 x half> @llvm.dx.normalize.v2f16(<2 x half>) +declare <3 x half> @llvm.dx.normalize.v3f16(<3 x half>) +declare <4 x half> @llvm.dx.normalize.v4f16(<4 x half>) + +declare float @llvm.dx.normalize.f32(float) +declare <2 x float> @llvm.dx.normalize.v2f32(<2 x float>) +declare <3 x float> @llvm.dx.normalize.v3f32(<3 x float>) +declare <4 x float> @llvm.dx.normalize.v4f32(<4 x float>) + +define noundef half @test_normalize_half(half noundef %p0) { +entry: + ; CHECK: fdiv half %p0, %p0 + %hlsl.normalize = call half @llvm.dx.normalize.f16(half %p0) + ret half %hlsl.normalize +} + +define noundef <2 x half> @test_normalize_half2(<2 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth2:%.*]] = call half @llvm.dx.dot2.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + ; DOPCHECK: [[doth2:%.*]] = call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth2]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth2]]) + ; CHECK: [[splatinserth2:%.*]] = insertelement <2 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] = shufflevector <2 x half> [[splatinserth2]], <2 x half> poison, <2 x i32> zeroinitializer + ; CHECK: fmul <2 x half> %p0, [[splat]] + + %hlsl.normalize = call <2 x half> @llvm.dx.normalize.v2f16(<2 x half> %p0) + ret <2 x half> %hlsl.normalize +} + +define noundef <3 x half> @test_normalize_half3(<3 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth3:%.*]] = call half @llvm.dx.dot3.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) + ; DOPCHECK: [[doth3:%.*]] = call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth3]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth3]]) + ; CHECK: [[splatinserth3:%.*]] = insertelement <3 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <3 x half> [[splatinserth3]], <3 x half> poison, <3 x i32> zeroinitializer + ; CHECK: fmul <3 x half> %p0, %.splat + + %hlsl.normalize = call <3 x half> @llvm.dx.normalize.v3f16(<3 x half> %p0) + ret <3 x half> %hlsl.normalize +} + +define noundef <4 x half> @test_normalize_half4(<4 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth4:%.*]] = call half @llvm.dx.dot4.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) + ; DOPCHECK: [[doth4:%.*]] = call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth4]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth4]]) + ; CHECK: [[splatinserth4:%.*]] = insertelement <4 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <4 x half> [[splatinserth4]], <4 x half> poison, <4 x i32> zeroinitializer + ; CHECK: fmul <4 x half> %p0, %.splat + + %hlsl.normalize = call <4 x half> @llvm.dx.normalize.v4f16(<4 x half> %p0) + ret <4 x half> %hlsl.normalize +} + +define noundef float @test_normalize_float(float noundef %p0) { +entry: + ; CHECK: fdiv float %p0, %p0 + %hlsl.normalize = call float @llvm.dx.normalize.f32(float %p0) + ret float %hlsl.normalize +} + +define noundef <2 x float> @test_normalize_float2(<2 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf2:%.*]] = call float @llvm.dx.dot2.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) + ; DOPCHECK: [[dotf2:%.*]] = call float @dx.op.dot2.f32(i32 
54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf2]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf2]]) + ; CHECK: [[splatinsertf2:%.*]] = insertelement <2 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <2 x float> [[splatinsertf2]], <2 x float> poison, <2 x i32> zeroinitializer + ; CHECK: fmul <2 x float> %p0, %.splat + + %hlsl.normalize = call <2 x float> @llvm.dx.normalize.v2f32(<2 x float> %p0) + ret <2 x float> %hlsl.normalize +} + +define noundef <3 x float> @test_normalize_float3(<3 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf3:%.*]] = call float @llvm.dx.dot3.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) + ; DOPCHECK: [[dotf3:%.*]] = call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf3]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf3]]) + ; CHECK: [[splatinsertf3:%.*]] = insertelement <3 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <3 x float> [[splatinsertf3]], <3 x float> poison, <3 x i32> zeroinitializer + ; CHECK: fmul <3 x float> %p0, %.splat + + %hlsl.normalize = call <3 x float> @llvm.dx.normalize.v3f32(<3 x float> %p0) + ret <3 x float> %hlsl.normalize +} + +define noundef <4 x float> @test_normalize_float4(<4 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf4:%.*]] = call float @llvm.dx.dot4.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + ; DOPCHECK: [[dotf4:%.*]] = call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf4]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf4]]) + ; CHECK: [[splatinsertf4:%.*]] = insertelement <4 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <4 x float> [[splatinsertf4]], <4 x float> poison, <4 x i32> zeroinitializer + ; CHECK: fmul <4 x float> %p0, %.splat + + %hlsl.normalize = call <4 x float> @llvm.dx.normalize.v4f32(<4 x float> %p0) + ret <4 x float> %hlsl.normalize +} diff --git a/llvm/test/CodeGen/DirectX/normalize_error.ll b/llvm/test/CodeGen/DirectX/normalize_error.ll index 3041d2ecdd92..35a91c0cdc24 100644 --- a/llvm/test/CodeGen/DirectX/normalize_error.ll +++ b/llvm/test/CodeGen/DirectX/normalize_error.ll @@ -1,10 +1,10 @@ -; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation normalize does not support double overload type -; CHECK: Cannot create Dot2 operation: Invalid overload type - -define noundef <2 x double> @test_normalize_double2(<2 x double> noundef %p0) { -entry: - %hlsl.normalize = call <2 x double> @llvm.dx.normalize.v2f32(<2 x double> %p0) - ret <2 x double> %hlsl.normalize -} +; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation normalize does not support double overload type +; CHECK: Cannot create Dot2 operation: Invalid overload type + +define noundef <2 x double> @test_normalize_double2(<2 x double> noundef %p0) { +entry: + %hlsl.normalize = call <2 x double> @llvm.dx.normalize.v2f32(<2 x double> %p0) + ret <2 x double> %hlsl.normalize +} diff --git a/llvm/test/CodeGen/DirectX/step.ll b/llvm/test/CodeGen/DirectX/step.ll 
index 6a9b5bf71da8..1c9894026c62 100644
--- a/llvm/test/CodeGen/DirectX/step.ll
+++ b/llvm/test/CodeGen/DirectX/step.ll
@@ -1,78 +1,78 @@
-; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefix=CHECK
-; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefix=CHECK
-
-; Make sure dxil operation function calls for step are generated for half/float.
-
-declare half @llvm.dx.step.f16(half, half)
-declare <2 x half> @llvm.dx.step.v2f16(<2 x half>, <2 x half>)
-declare <3 x half> @llvm.dx.step.v3f16(<3 x half>, <3 x half>)
-declare <4 x half> @llvm.dx.step.v4f16(<4 x half>, <4 x half>)
-
-declare float @llvm.dx.step.f32(float, float)
-declare <2 x float> @llvm.dx.step.v2f32(<2 x float>, <2 x float>)
-declare <3 x float> @llvm.dx.step.v3f32(<3 x float>, <3 x float>)
-declare <4 x float> @llvm.dx.step.v4f32(<4 x float>, <4 x float>)
-
-define noundef half @test_step_half(half noundef %p0, half noundef %p1) {
-entry:
-  ; CHECK: %0 = fcmp olt half %p1, %p0
-  ; CHECK: %1 = select i1 %0, half 0xH0000, half 0xH3C00
-  %hlsl.step = call half @llvm.dx.step.f16(half %p0, half %p1)
-  ret half %hlsl.step
-}
-
-define noundef <2 x half> @test_step_half2(<2 x half> noundef %p0, <2 x half> noundef %p1) {
-entry:
-  ; CHECK: %0 = fcmp olt <2 x half> %p1, %p0
-  ; CHECK: %1 = select <2 x i1> %0, <2 x half> zeroinitializer, <2 x half> <half 0xH3C00, half 0xH3C00>
-  %hlsl.step = call <2 x half> @llvm.dx.step.v2f16(<2 x half> %p0, <2 x half> %p1)
-  ret <2 x half> %hlsl.step
-}
-
-define noundef <3 x half> @test_step_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) {
-entry:
-  ; CHECK: %0 = fcmp olt <3 x half> %p1, %p0
-  ; CHECK: %1 = select <3 x i1> %0, <3 x half> zeroinitializer, <3 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00>
-  %hlsl.step = call <3 x half> @llvm.dx.step.v3f16(<3 x half> %p0, <3 x half> %p1)
-  ret <3 x half> %hlsl.step
-}
-
-define noundef <4 x half> @test_step_half4(<4 x half> noundef %p0, <4 x half> noundef %p1) {
-entry:
-  ; CHECK: %0 = fcmp olt <4 x half> %p1, %p0
-  ; CHECK: %1 = select <4 x i1> %0, <4 x half> zeroinitializer, <4 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>
-  %hlsl.step = call <4 x half> @llvm.dx.step.v4f16(<4 x half> %p0, <4 x half> %p1)
-  ret <4 x half> %hlsl.step
-}
-
-define noundef float @test_step_float(float noundef %p0, float noundef %p1) {
-entry:
-  ; CHECK: %0 = fcmp olt float %p1, %p0
-  ; CHECK: %1 = select i1 %0, float 0.000000e+00, float 1.000000e+00
-  %hlsl.step = call float @llvm.dx.step.f32(float %p0, float %p1)
-  ret float %hlsl.step
-}
-
-define noundef <2 x float> @test_step_float2(<2 x float> noundef %p0, <2 x float> noundef %p1) {
-entry:
-  ; CHECK: %0 = fcmp olt <2 x float> %p1, %p0
-  ; CHECK: %1 = select <2 x i1> %0, <2 x float> zeroinitializer, <2 x float> <float 1.000000e+00, float 1.000000e+00>
-  %hlsl.step = call <2 x float> @llvm.dx.step.v2f32(<2 x float> %p0, <2 x float> %p1)
-  ret <2 x float> %hlsl.step
-}
-
-define noundef <3 x float> @test_step_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) {
-entry:
-  ; CHECK: %0 = fcmp olt <3 x float> %p1, %p0
-  ; CHECK: %1 = select <3 x i1> %0, <3 x float> zeroinitializer, <3 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-  %hlsl.step = call <3 x float> @llvm.dx.step.v3f32(<3 x float> %p0, <3 x float> %p1)
-  ret <3 x float> %hlsl.step
-}
-
-define noundef <4 x float> @test_step_float4(<4 x float> noundef %p0, <4 x float> noundef %p1) {
-entry:
-  ; CHECK: %0 = fcmp olt <4 x float> %p1, %p0
-  ; CHECK: %1 = select <4 x i1> %0, <4 x float> zeroinitializer, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-  %hlsl.step = call <4 x float> @llvm.dx.step.v4f32(<4 x float> %p0, <4 x float> %p1)
-  ret <4 x float> %hlsl.step
-}
+; 
RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefix=CHECK
+; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefix=CHECK
+
+; Make sure dxil operation function calls for step are generated for half/float.
+
+declare half @llvm.dx.step.f16(half, half)
+declare <2 x half> @llvm.dx.step.v2f16(<2 x half>, <2 x half>)
+declare <3 x half> @llvm.dx.step.v3f16(<3 x half>, <3 x half>)
+declare <4 x half> @llvm.dx.step.v4f16(<4 x half>, <4 x half>)
+
+declare float @llvm.dx.step.f32(float, float)
+declare <2 x float> @llvm.dx.step.v2f32(<2 x float>, <2 x float>)
+declare <3 x float> @llvm.dx.step.v3f32(<3 x float>, <3 x float>)
+declare <4 x float> @llvm.dx.step.v4f32(<4 x float>, <4 x float>)
+
+define noundef half @test_step_half(half noundef %p0, half noundef %p1) {
+entry:
+  ; CHECK: %0 = fcmp olt half %p1, %p0
+  ; CHECK: %1 = select i1 %0, half 0xH0000, half 0xH3C00
+  %hlsl.step = call half @llvm.dx.step.f16(half %p0, half %p1)
+  ret half %hlsl.step
+}
+
+define noundef <2 x half> @test_step_half2(<2 x half> noundef %p0, <2 x half> noundef %p1) {
+entry:
+  ; CHECK: %0 = fcmp olt <2 x half> %p1, %p0
+  ; CHECK: %1 = select <2 x i1> %0, <2 x half> zeroinitializer, <2 x half> <half 0xH3C00, half 0xH3C00>
+  %hlsl.step = call <2 x half> @llvm.dx.step.v2f16(<2 x half> %p0, <2 x half> %p1)
+  ret <2 x half> %hlsl.step
+}
+
+define noundef <3 x half> @test_step_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) {
+entry:
+  ; CHECK: %0 = fcmp olt <3 x half> %p1, %p0
+  ; CHECK: %1 = select <3 x i1> %0, <3 x half> zeroinitializer, <3 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00>
+  %hlsl.step = call <3 x half> @llvm.dx.step.v3f16(<3 x half> %p0, <3 x half> %p1)
+  ret <3 x half> %hlsl.step
+}
+
+define noundef <4 x half> @test_step_half4(<4 x half> noundef %p0, <4 x half> noundef %p1) {
+entry:
+  ; CHECK: %0 = fcmp olt <4 x half> %p1, %p0
+  ; CHECK: %1 = select <4 x i1> %0, <4 x half> zeroinitializer, <4 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>
+  %hlsl.step = call <4 x half> @llvm.dx.step.v4f16(<4 x half> %p0, <4 x half> %p1)
+  ret <4 x half> %hlsl.step
+}
+
+define noundef float @test_step_float(float noundef %p0, float noundef %p1) {
+entry:
+  ; CHECK: %0 = fcmp olt float %p1, %p0
+  ; CHECK: %1 = select i1 %0, float 0.000000e+00, float 1.000000e+00
+  %hlsl.step = call float @llvm.dx.step.f32(float %p0, float %p1)
+  ret float %hlsl.step
+}
+
+define noundef <2 x float> @test_step_float2(<2 x float> noundef %p0, <2 x float> noundef %p1) {
+entry:
+  ; CHECK: %0 = fcmp olt <2 x float> %p1, %p0
+  ; CHECK: %1 = select <2 x i1> %0, <2 x float> zeroinitializer, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+  %hlsl.step = call <2 x float> @llvm.dx.step.v2f32(<2 x float> %p0, <2 x float> %p1)
+  ret <2 x float> %hlsl.step
+}
+
+define noundef <3 x float> @test_step_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) {
+entry:
+  ; CHECK: %0 = fcmp olt <3 x float> %p1, %p0
+  ; CHECK: %1 = select <3 x i1> %0, <3 x float> zeroinitializer, <3 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  %hlsl.step = call <3 x float> @llvm.dx.step.v3f32(<3 x float> %p0, <3 x float> %p1)
+  ret <3 x float> %hlsl.step
+}
+
+define noundef <4 x float> @test_step_float4(<4 x float> noundef %p0, <4 x float> noundef %p1) {
+entry:
+  ; CHECK: %0 = fcmp olt <4 x float> %p1, %p0
+  ; CHECK: %1 = select <4 x i1> %0, <4 x float> zeroinitializer, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  %hlsl.step = call <4 x float> @llvm.dx.step.v4f32(<4 x float> %p0, <4 x float> %p1)
+  ret <4 x float> %hlsl.step
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll
index a0306bae4a22..bdbfc133efa2 
100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll @@ -1,49 +1,49 @@ -; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 - -define noundef float @atan2_float(float noundef %a, float noundef %b) { -entry: -; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call float @llvm.atan2.f32(float %a, float %b) - ret float %elt.atan2 -} - -define noundef half @atan2_half(half noundef %a, half noundef %b) { -entry: -; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call half @llvm.atan2.f16(half %a, half %b) - ret half %elt.atan2 -} - -define noundef <4 x float> @atan2_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %a, <4 x float> %b) - ret <4 x float> %elt.atan2 -} - -define noundef <4 x half> @atan2_half4(<4 x half> noundef %a, <4 x half> noundef %b) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %a, <4 x half> %b) - ret <4 x half> %elt.atan2 -} - -declare half @llvm.atan2.f16(half, half) -declare float @llvm.atan2.f32(float, float) -declare <4 x half> @llvm.atan2.v4f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +define noundef float @atan2_float(float noundef %a, float noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call float @llvm.atan2.f32(float %a, float %b) + ret float %elt.atan2 +} + +define noundef half @atan2_half(half noundef %a, half noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call half @llvm.atan2.f16(half 
%a, half %b) + ret half %elt.atan2 +} + +define noundef <4 x float> @atan2_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %elt.atan2 +} + +define noundef <4 x half> @atan2_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %elt.atan2 +} + +declare half @llvm.atan2.f16(half, half) +declare float @llvm.atan2.f32(float, float) +declare <4 x half> @llvm.atan2.v4f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll index 7c06c14bb968..2e0eb8c429ac 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll @@ -1,33 +1,33 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for cross are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec3_float_16:]] = OpTypeVector %[[#float_16]] 3 -; CHECK-DAG: %[[#vec3_float_32:]] = OpTypeVector %[[#float_32]] 3 - -define noundef <3 x half> @cross_half4(<3 x half> noundef %a, <3 x half> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec3_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] - %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b) - ret <3 x half> %hlsl.cross -} - -define noundef <3 x float> @cross_float4(<3 x float> noundef %a, <3 x float> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec3_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] - %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b) - ret <3 x float> %hlsl.cross -} - -declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>) -declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for cross are lowered correctly. 
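+
+; For reference only (illustrative, not checked by this test): GLSL.std.450
+; Cross is the standard 3-component cross product. Assuming plain IR and
+; hypothetical value names, an equivalent expansion would be:
+;   %a.yzx = shufflevector <3 x half> %a, <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+;   %b.zxy = shufflevector <3 x half> %b, <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
+;   %a.zxy = shufflevector <3 x half> %a, <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
+;   %b.yzx = shufflevector <3 x half> %b, <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+;   %lhs = fmul <3 x half> %a.yzx, %b.zxy
+;   %rhs = fmul <3 x half> %a.zxy, %b.yzx
+;   %cross = fsub <3 x half> %lhs, %rhs  ; x = a.y*b.z - a.z*b.y, etc.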
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec3_float_16:]] = OpTypeVector %[[#float_16]] 3 +; CHECK-DAG: %[[#vec3_float_32:]] = OpTypeVector %[[#float_32]] 3 + +define noundef <3 x half> @cross_half4(<3 x half> noundef %a, <3 x half> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec3_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] + %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b) + ret <3 x half> %hlsl.cross +} + +define noundef <3 x float> @cross_float4(<3 x float> noundef %a, <3 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec3_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] + %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b) + ret <3 x float> %hlsl.cross +} + +declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>) +declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll index df1ef3a7287c..b4a9d8e0664b 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll @@ -1,29 +1,29 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for length are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef half @length_half4(<4 x half> noundef %a) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]] - %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a) - ret half %hlsl.length -} - -define noundef float @length_float4(<4 x float> noundef %a) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]] - %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a) - ret float %hlsl.length -} - -declare half @llvm.spv.length.v4f16(<4 x half>) -declare float @llvm.spv.length.v4f32(<4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for length are lowered correctly. 
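+
+; For reference only (illustrative, not checked by this test): Length reduces
+; to sqrt(dot(v, v)). Assuming plain IR and hypothetical value names, the
+; v4f32 case could be expanded as:
+;   %sq = fmul <4 x float> %a, %a
+;   %dot = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %sq)
+;   %hlsl.length = call float @llvm.sqrt.f32(float %dot)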
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef half @length_half4(<4 x half> noundef %a) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]] + %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a) + ret half %hlsl.length +} + +define noundef float @length_float4(<4 x float> noundef %a) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]] + %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a) + ret float %hlsl.length +} + +declare half @llvm.spv.length.v4f16(<4 x half>) +declare float @llvm.spv.length.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll index 4659b5146e43..fa73b9c2a4d3 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll @@ -1,31 +1,31 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for normalize are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef <4 x half> @normalize_half4(<4 x half> noundef %a) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Normalize %[[#arg0]] - %hlsl.normalize = call <4 x half> @llvm.spv.normalize.v4f16(<4 x half> %a) - ret <4 x half> %hlsl.normalize -} - -define noundef <4 x float> @normalize_float4(<4 x float> noundef %a) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Normalize %[[#arg0]] - %hlsl.normalize = call <4 x float> @llvm.spv.normalize.v4f32(<4 x float> %a) - ret <4 x float> %hlsl.normalize -} - -declare <4 x half> @llvm.spv.normalize.v4f16(<4 x half>) -declare <4 x float> @llvm.spv.normalize.v4f32(<4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for normalize are lowered correctly. 
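+
+; For reference only (illustrative, not checked by this test): Normalize is
+; v / length(v). Assuming plain IR and hypothetical value names:
+;   %sq = fmul <4 x float> %a, %a
+;   %dot = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %sq)
+;   %len = call float @llvm.sqrt.f32(float %dot)
+;   %len.ins = insertelement <4 x float> poison, float %len, i64 0
+;   %len.vec = shufflevector <4 x float> %len.ins, <4 x float> poison, <4 x i32> zeroinitializer
+;   %hlsl.normalize = fdiv <4 x float> %a, %len.vec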
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @normalize_half4(<4 x half> noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Normalize %[[#arg0]] + %hlsl.normalize = call <4 x half> @llvm.spv.normalize.v4f16(<4 x half> %a) + ret <4 x half> %hlsl.normalize +} + +define noundef <4 x float> @normalize_float4(<4 x float> noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Normalize %[[#arg0]] + %hlsl.normalize = call <4 x float> @llvm.spv.normalize.v4f32(<4 x float> %a) + ret <4 x float> %hlsl.normalize +} + +declare <4 x half> @llvm.spv.normalize.v4f16(<4 x half>) +declare <4 x float> @llvm.spv.normalize.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll index 7c0ee9398d15..bb50d8c790f8 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll @@ -1,33 +1,33 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for step are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef <4 x half> @step_half4(<4 x half> noundef %a, <4 x half> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] - %hlsl.step = call <4 x half> @llvm.spv.step.v4f16(<4 x half> %a, <4 x half> %b) - ret <4 x half> %hlsl.step -} - -define noundef <4 x float> @step_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] - %hlsl.step = call <4 x float> @llvm.spv.step.v4f32(<4 x float> %a, <4 x float> %b) - ret <4 x float> %hlsl.step -} - -declare <4 x half> @llvm.spv.step.v4f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.spv.step.v4f32(<4 x float>, <4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for step are lowered correctly. 
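+
+; For reference only (illustrative, not checked by this test): Step(edge, x)
+; is 0.0 where x < edge and 1.0 elsewhere, i.e. the same fcmp olt + select
+; expansion the DXIL step test earlier in this section checks. A scalar
+; sketch with hypothetical value names:
+;   %cmp = fcmp olt float %x, %edge
+;   %step = select i1 %cmp, float 0.000000e+00, float 1.000000e+00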
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @step_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] + %hlsl.step = call <4 x half> @llvm.spv.step.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %hlsl.step +} + +define noundef <4 x float> @step_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] + %hlsl.step = call <4 x float> @llvm.spv.step.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %hlsl.step +} + +declare <4 x half> @llvm.spv.step.v4f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.spv.step.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/Demangle/ms-placeholder-return-type.test b/llvm/test/Demangle/ms-placeholder-return-type.test index a656400fe140..18038e636c8d 100644 --- a/llvm/test/Demangle/ms-placeholder-return-type.test +++ b/llvm/test/Demangle/ms-placeholder-return-type.test @@ -1,18 +1,18 @@ -; RUN: llvm-undname < %s | FileCheck %s - -; CHECK-NOT: Invalid mangled name - -?TestNonTemplateAuto@@YA@XZ -; CHECK: __cdecl TestNonTemplateAuto(void) - -??$AutoT@X@@YA?A_PXZ -; CHECK: auto __cdecl AutoT(void) - -??$AutoT@X@@YA?B_PXZ -; CHECK: auto const __cdecl AutoT(void) - -??$AutoT@X@@YA?A_TXZ -; CHECK: decltype(auto) __cdecl AutoT(void) - -??$AutoT@X@@YA?B_TXZ -; CHECK: decltype(auto) const __cdecl AutoT(void) +; RUN: llvm-undname < %s | FileCheck %s + +; CHECK-NOT: Invalid mangled name + +?TestNonTemplateAuto@@YA@XZ +; CHECK: __cdecl TestNonTemplateAuto(void) + +??$AutoT@X@@YA?A_PXZ +; CHECK: auto __cdecl AutoT(void) + +??$AutoT@X@@YA?B_PXZ +; CHECK: auto const __cdecl AutoT(void) + +??$AutoT@X@@YA?A_TXZ +; CHECK: decltype(auto) __cdecl AutoT(void) + +??$AutoT@X@@YA?B_TXZ +; CHECK: decltype(auto) const __cdecl AutoT(void) diff --git a/llvm/test/FileCheck/dos-style-eol.txt b/llvm/test/FileCheck/dos-style-eol.txt index 52184f465c3f..4252aad4d3e7 100644 --- a/llvm/test/FileCheck/dos-style-eol.txt +++ b/llvm/test/FileCheck/dos-style-eol.txt @@ -1,11 +1,11 @@ -// Test for using FileCheck on DOS style end-of-line -// This test was deliberately committed with DOS style end of line. -// Don't change line endings! -// RUN: FileCheck -input-file %s %s -// RUN: FileCheck --strict-whitespace -input-file %s %s - -LINE 1 -; CHECK: {{^}}LINE 1{{$}} - -LINE 2 +// Test for using FileCheck on DOS style end-of-line +// This test was deliberately committed with DOS style end of line. +// Don't change line endings! 
+// RUN: FileCheck -input-file %s %s +// RUN: FileCheck --strict-whitespace -input-file %s %s + +LINE 1 +; CHECK: {{^}}LINE 1{{$}} + +LINE 2 ; CHECK: {{^}}LINE 2{{$}} \ No newline at end of file diff --git a/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri b/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri index 857c4ff87b6c..72d23d041ae8 100644 --- a/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri +++ b/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri @@ -1,4 +1,4 @@ -; this file intentionally has crlf line endings -create crlf.a -addmod foo.txt -end +; this file intentionally has crlf line endings +create crlf.a +addmod foo.txt +end diff --git a/llvm/test/tools/llvm-cvtres/Inputs/languages.rc b/llvm/test/tools/llvm-cvtres/Inputs/languages.rc index 82031d0e2083..081b3a77bebc 100644 --- a/llvm/test/tools/llvm-cvtres/Inputs/languages.rc +++ b/llvm/test/tools/llvm-cvtres/Inputs/languages.rc @@ -1,36 +1,36 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US -randomdat RCDATA -{ - "this is a random bit of data that means nothing\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -randomdat RCDATA -{ - "zhe4 shi4 yi1ge4 sui2ji1 de shu4ju4, zhe4 yi4wei4zhe shen2me\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_GERMAN, SUBLANG_GERMAN_LUXEMBOURG -randomdat RCDATA -{ - "Dies ist ein zufälliges Bit von Daten, die nichts bedeutet\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} +#include "windows.h" + +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US +randomdat RCDATA +{ + "this is a random bit of data that means nothing\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +randomdat RCDATA +{ + "zhe4 shi4 yi1ge4 sui2ji1 de shu4ju4, zhe4 yi4wei4zhe shen2me\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_GERMAN, SUBLANG_GERMAN_LUXEMBOURG +randomdat RCDATA +{ + "Dies ist ein zufälliges Bit von Daten, die nichts bedeutet\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} diff --git a/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc b/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc index 494849f57a0a..5ca097baa0f7 100644 --- a/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc +++ b/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc @@ -1,50 +1,50 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US - -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -cursor BITMAP "cursor_small.bmp" -okay BITMAP "okay_small.bmp" - -14432 MENU -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -{ - MENUITEM "yu", 100 - MENUITEM "shala", 101 - MENUITEM "kaoya", 102 -} - -testdialog DIALOG 10, 10, 200, 300 -STYLE WS_POPUP | WS_BORDER -CAPTION "Test" -{ - CTEXT "Continue:", 1, 10, 10, 230, 14 - PUSHBUTTON "&OK", 2, 66, 134, 161, 13 -} - -12 ACCELERATORS -{ - "X", 164, VIRTKEY, ALT - "H", 5678, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -"eat" MENU -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS -{ - MENUITEM "fish", 100 - MENUITEM "salad", 101 - MENUITEM "duck", 102 -} - - -myresource stringarray { - "this is a user defined resource\0", - "it contains many strings\0", +#include "windows.h" + +LANGUAGE 
LANG_ENGLISH, SUBLANG_ENGLISH_US + +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +cursor BITMAP "cursor_small.bmp" +okay BITMAP "okay_small.bmp" + +14432 MENU +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +{ + MENUITEM "yu", 100 + MENUITEM "shala", 101 + MENUITEM "kaoya", 102 +} + +testdialog DIALOG 10, 10, 200, 300 +STYLE WS_POPUP | WS_BORDER +CAPTION "Test" +{ + CTEXT "Continue:", 1, 10, 10, 230, 14 + PUSHBUTTON "&OK", 2, 66, 134, 161, 13 +} + +12 ACCELERATORS +{ + "X", 164, VIRTKEY, ALT + "H", 5678, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +"eat" MENU +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS +{ + MENUITEM "fish", 100 + MENUITEM "salad", 101 + MENUITEM "duck", 102 +} + + +myresource stringarray { + "this is a user defined resource\0", + "it contains many strings\0", } \ No newline at end of file diff --git a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc index c700b587af64..bb79dca399c2 100644 --- a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc +++ b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc @@ -1,16 +1,16 @@ -101 DIALOG 0, 0, 362, 246 -STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l | - 0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l | - 0x00080000l | 0x00040000l -CAPTION "MakeNSISW" -MENU 104 -FONT 8, "MS Shell Dlg" -BEGIN - CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l | - 0x0100l | 0x0800l | 0x00008000 | - 0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190 - CONTROL "",-1,"Static",0x00000010l,7,220,346,1 - LTEXT "",200,7,230,200,12,0x08000000l - DEFPUSHBUTTON "Test &Installer",203,230,226,60,15,0x08000000l | 0x00010000l - PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l -END +101 DIALOG 0, 0, 362, 246 +STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l | + 0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l | + 0x00080000l | 0x00040000l +CAPTION "MakeNSISW" +MENU 104 +FONT 8, "MS Shell Dlg" +BEGIN + CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l | + 0x0100l | 0x0800l | 0x00008000 | + 0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190 + CONTROL "",-1,"Static",0x00000010l,7,220,346,1 + LTEXT "",200,7,230,200,12,0x08000000l + DEFPUSHBUTTON "Test &Installer",203,230,226,60,15,0x08000000l | 0x00010000l + PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l +END diff --git a/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc b/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc index 6ad56bc02d73..fd616520dbe1 100644 --- a/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc +++ b/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc @@ -1,44 +1,44 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US - -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -cursor BITMAP "cursor_small.bmp" -okay BITMAP "okay_small.bmp" - -14432 MENU -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -{ - MENUITEM "yu", 100 - MENUITEM "shala", 101 - MENUITEM "kaoya", 102 -} - -testdialog DIALOG 10, 10, 200, 300 -STYLE WS_POPUP | WS_BORDER -CAPTION "Test" -{ - CTEXT "Continue:", 1, 10, 10, 230, 14 - PUSHBUTTON "&OK", 2, 66, 134, 161, 13 -} - -12 ACCELERATORS -{ - "X", 164, VIRTKEY, ALT - "H", 5678, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -"eat" MENU -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS -{ - MENUITEM "fish", 100 
- MENUITEM "salad", 101 - MENUITEM "duck", 102 -} +#include "windows.h" + +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US + +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +cursor BITMAP "cursor_small.bmp" +okay BITMAP "okay_small.bmp" + +14432 MENU +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +{ + MENUITEM "yu", 100 + MENUITEM "shala", 101 + MENUITEM "kaoya", 102 +} + +testdialog DIALOG 10, 10, 200, 300 +STYLE WS_POPUP | WS_BORDER +CAPTION "Test" +{ + CTEXT "Continue:", 1, 10, 10, 230, 14 + PUSHBUTTON "&OK", 2, 66, 134, 161, 13 +} + +12 ACCELERATORS +{ + "X", 164, VIRTKEY, ALT + "H", 5678, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +"eat" MENU +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS +{ + MENUITEM "fish", 100 + MENUITEM "salad", 101 + MENUITEM "duck", 102 +} diff --git a/llvm/unittests/Support/ModRefTest.cpp b/llvm/unittests/Support/ModRefTest.cpp index f77e7e39e14e..35107e50b32d 100644 --- a/llvm/unittests/Support/ModRefTest.cpp +++ b/llvm/unittests/Support/ModRefTest.cpp @@ -1,27 +1,27 @@ -//===- llvm/unittest/Support/ModRefTest.cpp - ModRef tests ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/ModRef.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/Support/raw_ostream.h" -#include "gtest/gtest.h" -#include - -using namespace llvm; - -namespace { - -// Verify that printing a MemoryEffects does not end with a ,. -TEST(ModRefTest, PrintMemoryEffects) { - std::string S; - raw_string_ostream OS(S); - OS << MemoryEffects::none(); - EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); -} - -} // namespace +//===- llvm/unittest/Support/ModRefTest.cpp - ModRef tests ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ModRef.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; + +namespace { + +// Verify that printing a MemoryEffects does not end with a ,. 
+TEST(ModRefTest, PrintMemoryEffects) { + std::string S; + raw_string_ostream OS(S); + OS << MemoryEffects::none(); + EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); +} + +} // namespace diff --git a/llvm/utils/LLVMVisualizers/llvm.natvis b/llvm/utils/LLVMVisualizers/llvm.natvis index 03ca2d33a80b..d83ae8013c51 100644 --- a/llvm/utils/LLVMVisualizers/llvm.natvis +++ b/llvm/utils/LLVMVisualizers/llvm.natvis @@ -1,408 +1,408 @@ - - - - - empty - {(value_type*)BeginX,[Size]} - {Size} elements - Uninitialized - - Size - Capacity - - Size - (value_type*)BeginX - - - - - - {U.VAL} - Cannot visualize APInts longer than 64 bits - - - {Data,[Length]} - {Length} elements - Uninitialized - - Length - - Length - Data - - - - - {(const char*)BeginX,[Size]s8} - (const char*)BeginX,[Size] - - Size - Capacity - - Size - (char*)BeginX - - - - - - {First,[Last - First]s8} - - - - {Data,[Length]s8} - Data,[Length]s8 - - Length - - Length - Data - - - - - - {($T1)*(intptr_t *)Data} - - - - - - {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} - {($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)} - {$T6::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} [{($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)}] - - ($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask) - ($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask) - - - - - {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} - {((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)} - {$T5::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} [{((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)}] - - ($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask) - ((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask) - - - - - - {($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} - - - {($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} - - Unexpected index in PointerUnion: {(*(intptr_t *)Val.Value.Data>>$T2::InfoTy::IntShift) & $T2::InfoTy::IntMask} - - "$T4",s8b - - ($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) - - "$T5",s8b - - ($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) - - - - - - {{ empty }} - {{ head={Head} }} - - - Head - Next - this - - - - - - empty - RefPtr [1 ref] {*Obj} - RefPtr [{Obj->RefCount} refs] {*Obj} - - Obj->RefCount - Obj - - - - - {{ [Small Mode] size={NumNonEmpty}, capacity={CurArraySize} }} - {{ [Big Mode] size={NumNonEmpty}, capacity={CurArraySize} }} - - NumNonEmpty - CurArraySize - - NumNonEmpty - ($T1*)CurArray - - - - - - empty - {{ size={NumEntries}, buckets={NumBuckets} }} - - NumEntries - NumBuckets - - NumBuckets - Buckets - - - - - - {{ size={NumItems}, buckets={NumBuckets} }} - - NumItems - NumBuckets - - NumBuckets - (MapEntryTy**)TheTable - - - - - - empty - ({this+1,s8}, {second}) - - this+1,s - second - - - - - {Data} - - - - None - {Storage.value} - - Storage.value - - - - - Error - {*((storage_type *)TStorage.buffer)} - - *((storage_type *)TStorage.buffer) - *((error_type *)ErrorStorage.buffer) - - - - - - - {{little endian value = {*(($T1*)(unsigned char *)Value.buffer)} }} - - (unsigned char *)Value.buffer,1 - (unsigned char *)Value.buffer,2 - (unsigned char *)Value.buffer,4 - (unsigned char *)Value.buffer,8 - - - - - - {{ big endian value = {*(unsigned char *)Value.buffer} }} - {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 8) - | ($T1)(*((unsigned char *)Value.buffer+1))} }} - {{ big endian value = 
{(($T1)(*(unsigned char *)Value.buffer) << 24) - | (($T1)(*((unsigned char *)Value.buffer+1)) << 16) - | (($T1)(*((unsigned char *)Value.buffer+2)) << 8) - | ($T1)(*((unsigned char *)Value.buffer+3))} }} - {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 56) - | (($T1)(*((unsigned char *)Value.buffer+1)) << 48) - | (($T1)(*((unsigned char *)Value.buffer+2)) << 40) - | (($T1)(*((unsigned char *)Value.buffer+3)) << 32) - | (($T1)(*((unsigned char *)Value.buffer+4)) << 24) - | (($T1)(*((unsigned char *)Value.buffer+5)) << 16) - | (($T1)(*((unsigned char *)Value.buffer+6)) << 8) - | ($T1)(*((unsigned char *)Value.buffer+7))} }} - - (unsigned char *)Value.buffer,1 - (unsigned char *)Value.buffer,2 - (unsigned char *)Value.buffer,4 - (unsigned char *)Value.buffer,8 - - - - - {ID} - - ID - - SubclassData - - *ContainedTys - - {NumContainedTys - 1} - - - NumContainedTys - 1 - ContainedTys + 1 - - - - SubclassData == 1 - - (SubclassData & llvm::StructType::SCDB_HasBody) != 0 - (SubclassData & llvm::StructType::SCDB_Packed) != 0 - (SubclassData & llvm::StructType::SCDB_IsLiteral) != 0 - (SubclassData & llvm::StructType::SCDB_IsSized) != 0 - - {NumContainedTys} - - - NumContainedTys - ContainedTys - - - - - *ContainedTys - ((llvm::ArrayType*)this)->NumElements - - *ContainedTys - ((llvm::VectorType*)this)->ElementQuantity - - *ContainedTys - ((llvm::VectorType*)this)->ElementQuantity - - SubclassData - *ContainedTys - - Context - - - - - $(Type) {*Value} - - - - $(Type) {(llvm::ISD::NodeType)this->NodeType} - - - NumOperands - OperandList - - - - - - i{Val.BitWidth} {Val.VAL} - - - - {IDAndSubclassData >> 8}bit integer type - - - - $(Type) {*VTy} {this->getName()} {SubclassData} - $(Type) {*VTy} anon {SubclassData} - - (Instruction*)this - (User*)this - - UseList - Next - Prev.Value & 3 == 3 ? 
(User*)(this + 1) : (User*)(this + 2) - - - - - - - Val - - - - - - - $(Type) {*VTy} {this->getName()} {SubclassData} - $(Type) {*VTy} anon {SubclassData} - - (Value*)this,nd - *VTy - - NumUserOperands - (llvm::Use*)this - NumUserOperands - - - NumUserOperands - *((llvm::Use**)this - 1) - - - - - - {getOpcodeName(SubclassID - InstructionVal)} - - (User*)this,nd - - - - - {this->getName()} {(LinkageTypes)Linkage} {(VisibilityTypes)Visibility} {(DLLStorageClassTypes)DllStorageClass} {(llvm::GlobalValue::ThreadLocalMode) ThreadLocal} - - - - - - - this - Next - this - - - - - - - pImpl - - - - - {ModuleID,s8} {TargetTriple} - - - - $(Type) {PassID} {Kind} - - + + + + + empty + {(value_type*)BeginX,[Size]} + {Size} elements + Uninitialized + + Size + Capacity + + Size + (value_type*)BeginX + + + + + + {U.VAL} + Cannot visualize APInts longer than 64 bits + + + {Data,[Length]} + {Length} elements + Uninitialized + + Length + + Length + Data + + + + + {(const char*)BeginX,[Size]s8} + (const char*)BeginX,[Size] + + Size + Capacity + + Size + (char*)BeginX + + + + + + {First,[Last - First]s8} + + + + {Data,[Length]s8} + Data,[Length]s8 + + Length + + Length + Data + + + + + + {($T1)*(intptr_t *)Data} + + + + + + {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} + {($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)} + {$T6::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} [{($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)}] + + ($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask) + ($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask) + + + + + {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} + {((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)} + {$T5::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} [{((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)}] + + ($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask) + ((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask) + + + + + + {($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} + + + {($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} + + Unexpected index in PointerUnion: {(*(intptr_t *)Val.Value.Data>>$T2::InfoTy::IntShift) & $T2::InfoTy::IntMask} + + "$T4",s8b + + ($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) + + "$T5",s8b + + ($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) + + + + + + {{ empty }} + {{ head={Head} }} + + + Head + Next + this + + + + + + empty + RefPtr [1 ref] {*Obj} + RefPtr [{Obj->RefCount} refs] {*Obj} + + Obj->RefCount + Obj + + + + + {{ [Small Mode] size={NumNonEmpty}, capacity={CurArraySize} }} + {{ [Big Mode] size={NumNonEmpty}, capacity={CurArraySize} }} + + NumNonEmpty + CurArraySize + + NumNonEmpty + ($T1*)CurArray + + + + + + empty + {{ size={NumEntries}, buckets={NumBuckets} }} + + NumEntries + NumBuckets + + NumBuckets + Buckets + + + + + + {{ size={NumItems}, buckets={NumBuckets} }} + + NumItems + NumBuckets + + NumBuckets + (MapEntryTy**)TheTable + + + + + + empty + ({this+1,s8}, {second}) + + this+1,s + second + + + + + {Data} + + + + None + {Storage.value} + + Storage.value + + + + + Error + {*((storage_type *)TStorage.buffer)} + + *((storage_type *)TStorage.buffer) + *((error_type *)ErrorStorage.buffer) + + + + + + + {{little endian value = {*(($T1*)(unsigned char *)Value.buffer)} }} + + (unsigned char *)Value.buffer,1 + (unsigned char *)Value.buffer,2 + (unsigned char *)Value.buffer,4 + (unsigned char *)Value.buffer,8 + + + + + + {{ big 
endian value = {*(unsigned char *)Value.buffer} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 8) + | ($T1)(*((unsigned char *)Value.buffer+1))} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 24) + | (($T1)(*((unsigned char *)Value.buffer+1)) << 16) + | (($T1)(*((unsigned char *)Value.buffer+2)) << 8) + | ($T1)(*((unsigned char *)Value.buffer+3))} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 56) + | (($T1)(*((unsigned char *)Value.buffer+1)) << 48) + | (($T1)(*((unsigned char *)Value.buffer+2)) << 40) + | (($T1)(*((unsigned char *)Value.buffer+3)) << 32) + | (($T1)(*((unsigned char *)Value.buffer+4)) << 24) + | (($T1)(*((unsigned char *)Value.buffer+5)) << 16) + | (($T1)(*((unsigned char *)Value.buffer+6)) << 8) + | ($T1)(*((unsigned char *)Value.buffer+7))} }} + + (unsigned char *)Value.buffer,1 + (unsigned char *)Value.buffer,2 + (unsigned char *)Value.buffer,4 + (unsigned char *)Value.buffer,8 + + + + + {ID} + + ID + + SubclassData + + *ContainedTys + + {NumContainedTys - 1} + + + NumContainedTys - 1 + ContainedTys + 1 + + + + SubclassData == 1 + + (SubclassData & llvm::StructType::SCDB_HasBody) != 0 + (SubclassData & llvm::StructType::SCDB_Packed) != 0 + (SubclassData & llvm::StructType::SCDB_IsLiteral) != 0 + (SubclassData & llvm::StructType::SCDB_IsSized) != 0 + + {NumContainedTys} + + + NumContainedTys + ContainedTys + + + + + *ContainedTys + ((llvm::ArrayType*)this)->NumElements + + *ContainedTys + ((llvm::VectorType*)this)->ElementQuantity + + *ContainedTys + ((llvm::VectorType*)this)->ElementQuantity + + SubclassData + *ContainedTys + + Context + + + + + $(Type) {*Value} + + + + $(Type) {(llvm::ISD::NodeType)this->NodeType} + + + NumOperands + OperandList + + + + + + i{Val.BitWidth} {Val.VAL} + + + + {IDAndSubclassData >> 8}bit integer type + + + + $(Type) {*VTy} {this->getName()} {SubclassData} + $(Type) {*VTy} anon {SubclassData} + + (Instruction*)this + (User*)this + + UseList + Next + Prev.Value & 3 == 3 ? (User*)(this + 1) : (User*)(this + 2) + + + + + + + Val + + + + + + + $(Type) {*VTy} {this->getName()} {SubclassData} + $(Type) {*VTy} anon {SubclassData} + + (Value*)this,nd + *VTy + + NumUserOperands + (llvm::Use*)this - NumUserOperands + + + NumUserOperands + *((llvm::Use**)this - 1) + + + + + + {getOpcodeName(SubclassID - InstructionVal)} + + (User*)this,nd + + + + + {this->getName()} {(LinkageTypes)Linkage} {(VisibilityTypes)Visibility} {(DLLStorageClassTypes)DllStorageClass} {(llvm::GlobalValue::ThreadLocalMode) ThreadLocal} + + + + + + + this + Next + this + + + + + + + pImpl + + + + + {ModuleID,s8} {TargetTriple} + + + + $(Type) {PassID} {Kind} + + diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos index 0f25621c787e..7a0560654c5c 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos @@ -1,3 +1,3 @@ -In this file, the -sequence "\r\n" -terminates lines. +In this file, the +sequence "\r\n" +terminates lines. 
diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index 3718673ae7a2..dd041d7d384e 100755
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -1,515 +1,515 @@
-@echo off
-setlocal enabledelayedexpansion
-
-goto begin
-
-:usage
-echo Script for building the LLVM installer on Windows,
-echo used for the releases at https://github.com/llvm/llvm-project/releases
-echo.
-echo Usage: build_llvm_release.bat --version ^<version^> [--x86,--x64, --arm64] [--skip-checkout] [--local-python]
-echo.
-echo Options:
-echo --version: [required] version to build
-echo --help: display this help
-echo --x86: build and test x86 variant
-echo --x64: build and test x64 variant
-echo --arm64: build and test arm64 variant
-echo --skip-checkout: use local git checkout instead of downloading src.zip
-echo --local-python: use installed Python and does not try to use a specific version (3.10)
-echo.
-echo Note: At least one variant to build is required.
-echo.
-echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64
-exit /b 1
-
-:begin
-
-::==============================================================================
-:: parse args
-set version=
-set help=
-set x86=
-set x64=
-set arm64=
-set skip-checkout=
-set local-python=
-call :parse_args %*
-
-if "%help%" NEQ "" goto usage
-
-if "%version%" == "" (
- echo --version option is required
- echo =============================
- goto usage
-)
-
-if "%arm64%" == "" if "%x64%" == "" if "%x86%" == "" (
- echo nothing to build!
- echo choose one or several variants from: --x86 --x64 --arm64
- exit /b 1
-)
-
-::==============================================================================
-:: check prerequisites
-REM Note:
-REM 7zip versions 21.x and higher will try to extract the symlinks in
-REM llvm's git archive, which requires running as administrator.
-
-REM Check 7-zip version and/or administrator permissions.
-for /f "delims=" %%i in ('7z.exe ^| findstr /r "2[1-9].[0-9][0-9]"') do set version_7z=%%i
-if not "%version_7z%"=="" (
- REM Unique temporary filename to use by the 'mklink' command.
- set "link_name=%temp%\%username%_%random%_%random%.tmp"
-
- REM As the 'mklink' requires elevated permissions, the symbolic link
- REM creation will fail if the script is not running as administrator.
- mklink /d "!link_name!" . 1>nul 2>nul
- if errorlevel 1 (
- echo.
- echo Script requires administrator permissions, or a 7-zip version 20.x or older.
- echo Current version is "%version_7z%"
- exit /b 1
- ) else (
- REM Remove the temporary symbolic link.
- rd "!link_name!"
- )
-)
-
-REM Prerequisites:
-REM
-REM Visual Studio 2019, CMake, Ninja, GNUWin32, SWIG, Python 3,
-REM NSIS with the strlen_8192 patch,
-REM Perl (for the OpenMP run-time).
-REM
-REM
-REM For LLDB, SWIG version 4.1.1 should be used.
-REM - -:: Detect Visual Studio -set vsinstall= -set vswhere=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe - -if "%VSINSTALLDIR%" NEQ "" ( - echo using enabled Visual Studio installation - set "vsinstall=%VSINSTALLDIR%" -) else ( - echo using vswhere to detect Visual Studio installation - FOR /F "delims=" %%r IN ('^""%vswhere%" -nologo -latest -products "*" -all -property installationPath^"') DO set vsinstall=%%r -) -set "vsdevcmd=%vsinstall%\Common7\Tools\VsDevCmd.bat" - -if not exist "%vsdevcmd%" ( - echo Can't find any installation of Visual Studio - exit /b 1 -) -echo Using VS devcmd: %vsdevcmd% - -::============================================================================== -:: start echoing what we do -@echo on - -set python32_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310-32 -set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310 -set pythonarm64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python311-arm64 - -set revision=llvmorg-%version% -set package_version=%version% -set build_dir=%cd%\llvm_package_%package_version% - -echo Revision: %revision% -echo Package version: %package_version% -echo Build dir: %build_dir% -echo. - -if exist %build_dir% ( - echo Build directory already exists: %build_dir% - exit /b 1 -) -mkdir %build_dir% -cd %build_dir% || exit /b 1 - -if "%skip-checkout%" == "true" ( - echo Using local source - set llvm_src=%~dp0..\..\.. -) else ( - echo Checking out %revision% - curl -L https://github.com/llvm/llvm-project/archive/%revision%.zip -o src.zip || exit /b 1 - 7z x src.zip || exit /b 1 - mv llvm-project-* llvm-project || exit /b 1 - set llvm_src=%build_dir%\llvm-project -) - -curl -O https://gitlab.gnome.org/GNOME/libxml2/-/archive/v2.9.12/libxml2-v2.9.12.tar.gz || exit /b 1 -tar zxf libxml2-v2.9.12.tar.gz - -REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226. -REM Common flags for all builds. -set common_compiler_flags=-DLIBXML_STATIC -set common_cmake_flags=^ - -DCMAKE_BUILD_TYPE=Release ^ - -DLLVM_ENABLE_ASSERTIONS=OFF ^ - -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON ^ - -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86" ^ - -DLLVM_BUILD_LLVM_C_DYLIB=ON ^ - -DCMAKE_INSTALL_UCRT_LIBRARIES=ON ^ - -DPython3_FIND_REGISTRY=NEVER ^ - -DPACKAGE_VERSION=%package_version% ^ - -DLLDB_RELOCATABLE_PYTHON=1 ^ - -DLLDB_EMBED_PYTHON_HOME=OFF ^ - -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^ - -DLLVM_ENABLE_LIBXML2=FORCE_ON ^ - -DLLDB_ENABLE_LIBXML2=OFF ^ - -DCLANG_ENABLE_LIBXML2=OFF ^ - -DCMAKE_C_FLAGS="%common_compiler_flags%" ^ - -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^ - -DLLVM_ENABLE_RPMALLOC=ON ^ - -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp" - -set cmake_profile_flags="" - -REM Preserve original path -set OLDPATH=%PATH% - -REM Build the 32-bits and/or 64-bits binaries. -if "%x86%" == "true" call :do_build_32 || exit /b 1 -if "%x64%" == "true" call :do_build_64 || exit /b 1 -if "%arm64%" == "true" call :do_build_arm64 || exit /b 1 -exit /b 0 - -::============================================================================== -:: Build 32-bits binaries. -::============================================================================== -:do_build_32 -call :set_environment %python32_dir% || exit /b 1 -call "%vsdevcmd%" -arch=x86 || exit /b 1 -@echo on -mkdir build32_stage0 -cd build32_stage0 -call :do_build_libxml || exit /b 1 - -REM Stage0 binaries directory; used in stage1. 
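-REM (For orientation: this is a two-stage bootstrap. Stage0 is LLVM built
-REM with the MSVC toolchain; the stage1 build below is then configured to
-REM use the just-built stage0 clang-cl, e.g.
-REM   -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe
-REM as assembled in the flag block that follows.)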
-set "stage0_bin_dir=%build_dir%/build32_stage0/bin" -set cmake_flags=^ - %common_cmake_flags% ^ - -DLLVM_ENABLE_RPMALLOC=OFF ^ - -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^ - -DPYTHON_HOME=%PYTHONHOME% ^ - -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ - -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib - -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -cd.. - -REM CMake expects the paths that specifies the compiler and linker to be -REM with forward slash. -set all_cmake_flags=^ - %cmake_flags% ^ - -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ - -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ - -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe -set cmake_flags=%all_cmake_flags:\=/% - -mkdir build32 -cd build32 -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -ninja package || exit /b 1 -cd .. - -exit /b 0 -::============================================================================== - -::============================================================================== -:: Build 64-bits binaries. -::============================================================================== -:do_build_64 -call :set_environment %python64_dir% || exit /b 1 -call "%vsdevcmd%" -arch=amd64 || exit /b 1 -@echo on -mkdir build64_stage0 -cd build64_stage0 -call :do_build_libxml || exit /b 1 - -REM Stage0 binaries directory; used in stage1. -set "stage0_bin_dir=%build_dir%/build64_stage0/bin" -set cmake_flags=^ - %common_cmake_flags% ^ - -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^ - -DPYTHON_HOME=%PYTHONHOME% ^ - -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ - -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib - -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1 -cd.. - -REM CMake expects the paths that specifies the compiler and linker to be -REM with forward slash. 
-set all_cmake_flags=^ - %cmake_flags% ^ - -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ - -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ - -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe -set cmake_flags=%all_cmake_flags:\=/% - - -mkdir build64 -cd build64 -call :do_generate_profile || exit /b 1 -cmake -GNinja %cmake_flags% %cmake_profile_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1 -ninja package || exit /b 1 - -:: generate tarball with install toolchain only off -set filename=clang+llvm-%version%-x86_64-pc-windows-msvc -cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^ - -DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1 -ninja install || exit /b 1 -:: check llvm_config is present & returns something -%build_dir%/%filename%/bin/llvm-config.exe --bindir || exit /b 1 -cd .. -7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz - -exit /b 0 -::============================================================================== - -::============================================================================== -:: Build arm64 binaries. -::============================================================================== -:do_build_arm64 -call :set_environment %pythonarm64_dir% || exit /b 1 -call "%vsdevcmd%" -host_arch=x64 -arch=arm64 || exit /b 1 -@echo on -mkdir build_arm64_stage0 -cd build_arm64_stage0 -call :do_build_libxml || exit /b 1 - -REM Stage0 binaries directory; used in stage1. -set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin" -set cmake_flags=^ - %common_cmake_flags% ^ - -DCLANG_DEFAULT_LINKER=lld ^ - -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ - -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^ - -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DCOMPILER_RT_BUILD_PROFILE=OFF ^ - -DCOMPILER_RT_BUILD_SANITIZERS=OFF - -REM We need to build stage0 compiler-rt with clang-cl (msvc lacks some builtins). -cmake -GNinja %cmake_flags% ^ - -DCMAKE_C_COMPILER=clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=clang-cl.exe ^ - %llvm_src%\llvm || exit /b 1 -ninja || exit /b 1 -::ninja check-llvm || exit /b 1 -::ninja check-clang || exit /b 1 -::ninja check-lld || exit /b 1 -::ninja check-sanitizer || exit /b 1 -::ninja check-clang-tools || exit /b 1 -::ninja check-clangd || exit /b 1 -cd.. - -REM CMake expects the paths that specifies the compiler and linker to be -REM with forward slash. -REM CPACK_SYSTEM_NAME is set to have a correct name for installer generated. 
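-REM (Illustration only: with -DCPACK_SYSTEM_NAME=woa64, "ninja package"
-REM should produce an installer named along the lines of
-REM   LLVM-<version>-woa64.exe
-REM instead of the default win64-style name; exact naming is up to CPack.)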
-set all_cmake_flags=^ - %cmake_flags% ^ - -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ - -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ - -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe ^ - -DCPACK_SYSTEM_NAME=woa64 -set cmake_flags=%all_cmake_flags:\=/% - -mkdir build_arm64 -cd build_arm64 -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || exit /b 1 -REM Check but do not fail on errors. -ninja check-lldb -::ninja check-llvm || exit /b 1 -::ninja check-clang || exit /b 1 -::ninja check-lld || exit /b 1 -::ninja check-sanitizer || exit /b 1 -::ninja check-clang-tools || exit /b 1 -::ninja check-clangd || exit /b 1 -ninja package || exit /b 1 -cd .. - -exit /b 0 -::============================================================================== -:: -::============================================================================== -:: Set PATH and some environment variables. -::============================================================================== -:set_environment -REM Restore original path -set PATH=%OLDPATH% - -set python_dir=%1 - -REM Set Python environment -if "%local-python%" == "true" ( - FOR /F "delims=" %%i IN ('where python.exe ^| head -1') DO set python_exe=%%i - set PYTHONHOME=!python_exe:~0,-11! -) else ( - %python_dir%/python.exe --version || exit /b 1 - set PYTHONHOME=%python_dir% -) -set PATH=%PYTHONHOME%;%PATH% - -set "VSCMD_START_DIR=%build_dir%" - -exit /b 0 - -::============================================================================= - -::============================================================================== -:: Build libxml. -::============================================================================== -:do_build_libxml -mkdir libxmlbuild -cd libxmlbuild -cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install ^ - -DBUILD_SHARED_LIBS=OFF -DLIBXML2_WITH_C14N=OFF -DLIBXML2_WITH_CATALOG=OFF ^ - -DLIBXML2_WITH_DEBUG=OFF -DLIBXML2_WITH_DOCB=OFF -DLIBXML2_WITH_FTP=OFF ^ - -DLIBXML2_WITH_HTML=OFF -DLIBXML2_WITH_HTTP=OFF -DLIBXML2_WITH_ICONV=OFF ^ - -DLIBXML2_WITH_ICU=OFF -DLIBXML2_WITH_ISO8859X=OFF -DLIBXML2_WITH_LEGACY=OFF ^ - -DLIBXML2_WITH_LZMA=OFF -DLIBXML2_WITH_MEM_DEBUG=OFF -DLIBXML2_WITH_MODULES=OFF ^ - -DLIBXML2_WITH_OUTPUT=ON -DLIBXML2_WITH_PATTERN=OFF -DLIBXML2_WITH_PROGRAMS=OFF ^ - -DLIBXML2_WITH_PUSH=OFF -DLIBXML2_WITH_PYTHON=OFF -DLIBXML2_WITH_READER=OFF ^ - -DLIBXML2_WITH_REGEXPS=OFF -DLIBXML2_WITH_RUN_DEBUG=OFF -DLIBXML2_WITH_SAX1=OFF ^ - -DLIBXML2_WITH_SCHEMAS=OFF -DLIBXML2_WITH_SCHEMATRON=OFF -DLIBXML2_WITH_TESTS=OFF ^ - -DLIBXML2_WITH_THREADS=ON -DLIBXML2_WITH_THREAD_ALLOC=OFF -DLIBXML2_WITH_TREE=ON ^ - -DLIBXML2_WITH_VALID=OFF -DLIBXML2_WITH_WRITER=OFF -DLIBXML2_WITH_XINCLUDE=OFF ^ - -DLIBXML2_WITH_XPATH=OFF -DLIBXML2_WITH_XPTR=OFF -DLIBXML2_WITH_ZLIB=OFF ^ - -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded ^ - ../../libxml2-v2.9.12 || exit /b 1 -ninja install || exit /b 1 -set libxmldir=%cd%\install -set "libxmldir=%libxmldir:\=/%" -cd .. -exit /b 0 - -::============================================================================== -:: Generate a PGO profile. -::============================================================================== -:do_generate_profile -REM Build Clang with instrumentation. 
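-REM (Outline of the PGO flow below, with illustrative paths: build an
-REM IR-instrumented clang, compile a training workload (part of clangSema)
-REM with it, merge the raw profiles, e.g.
-REM   llvm-profdata merge -output=profile.profdata instrument\profiles\*.profraw
-REM then feed the merged file back via -DLLVM_PROFDATA_FILE.)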
-mkdir instrument
-cd instrument
-cmake -GNinja %cmake_flags% -DLLVM_TARGETS_TO_BUILD=Native ^
-    -DLLVM_BUILD_INSTRUMENTED=IR %llvm_src%\llvm || exit /b 1
-ninja clang || ninja clang || ninja clang || exit /b 1
-set instrumented_clang=%cd:\=/%/bin/clang-cl.exe
-cd ..
-REM Use that to build part of llvm to generate a profile.
-mkdir train
-cd train
-cmake -GNinja %cmake_flags% ^
-    -DCMAKE_C_COMPILER=%instrumented_clang% ^
-    -DCMAKE_CXX_COMPILER=%instrumented_clang% ^
-    -DLLVM_ENABLE_PROJECTS=clang ^
-    -DLLVM_TARGETS_TO_BUILD=Native ^
-    %llvm_src%\llvm || exit /b 1
-REM Drop profiles generated from running cmake; those are not representative.
-del ..\instrument\profiles\*.profraw
-ninja tools/clang/lib/Sema/CMakeFiles/obj.clangSema.dir/Sema.cpp.obj
-cd ..
-set profile=%cd:\=/%/profile.profdata
-%stage0_bin_dir%\llvm-profdata merge -output=%profile% instrument\profiles\*.profraw || exit /b 1
-set common_compiler_flags=%common_compiler_flags% -Wno-backend-plugin
-set cmake_profile_flags=-DLLVM_PROFDATA_FILE=%profile% ^
-    -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
-    -DCMAKE_CXX_FLAGS="%common_compiler_flags%"
-exit /b 0
-
-::=============================================================================
-:: Parse command line arguments.
-:: The format for the arguments is:
-::   Boolean: --option
-::   Value: --option<separator>value
-::     with <separator> being: space, colon, semicolon or equal sign
-::
-:: Command line usage example:
-::   my-batch-file.bat --build --type=release --version 123
-:: It will create 3 variables:
-::   'build' with the value 'true'
-::   'type' with the value 'release'
-::   'version' with the value '123'
-::
-:: Usage:
-::   set "build="
-::   set "type="
-::   set "version="
-::
-::   REM Parse arguments.
-::   call :parse_args %*
-::
-::   if defined build (
-::     ...
-::   )
-::   if %type%=='release' (
-::     ...
-::   )
-::   if %version%=='123' (
-::     ...
-::   )
-::=============================================================================
-:parse_args
-  set "arg_name="
-  :parse_args_start
-  if "%1" == "" (
-    :: Set a seen boolean argument.
-    if "%arg_name%" neq "" (
-      set "%arg_name%=true"
-    )
-    goto :parse_args_done
-  )
-  set aux=%1
-  if "%aux:~0,2%" == "--" (
-    :: Set a seen boolean argument.
-    if "%arg_name%" neq "" (
-      set "%arg_name%=true"
-    )
-    set "arg_name=%aux:~2,250%"
-  ) else (
-    set "%arg_name%=%1"
-    set "arg_name="
-  )
-  shift
-  goto :parse_args_start
-
-:parse_args_done
-exit /b 0
+@echo off
+setlocal enabledelayedexpansion
+
+goto begin
+
+:usage
+echo Script for building the LLVM installer on Windows,
+echo used for the releases at https://github.com/llvm/llvm-project/releases
+echo.
+echo Usage: build_llvm_release.bat --version ^<version^> [--x86, --x64, --arm64] [--skip-checkout] [--local-python]
+echo.
+echo Options:
+echo --version: [required] version to build
+echo --help: display this help
+echo --x86: build and test the x86 variant
+echo --x64: build and test the x64 variant
+echo --arm64: build and test the arm64 variant
+echo --skip-checkout: use a local git checkout instead of downloading src.zip
+echo --local-python: use the installed Python and do not require the specific version (3.10)
+echo.
+echo Note: At least one variant to build is required.
+echo.
+echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64 +exit /b 1 + +:begin + +::============================================================================== +:: parse args +set version= +set help= +set x86= +set x64= +set arm64= +set skip-checkout= +set local-python= +call :parse_args %* + +if "%help%" NEQ "" goto usage + +if "%version%" == "" ( + echo --version option is required + echo ============================= + goto usage +) + +if "%arm64%" == "" if "%x64%" == "" if "%x86%" == "" ( + echo nothing to build! + echo choose one or several variants from: --x86 --x64 --arm64 + exit /b 1 +) + +::============================================================================== +:: check prerequisites +REM Note: +REM 7zip versions 21.x and higher will try to extract the symlinks in +REM llvm's git archive, which requires running as administrator. + +REM Check 7-zip version and/or administrator permissions. +for /f "delims=" %%i in ('7z.exe ^| findstr /r "2[1-9].[0-9][0-9]"') do set version_7z=%%i +if not "%version_7z%"=="" ( + REM Unique temporary filename to use by the 'mklink' command. + set "link_name=%temp%\%username%_%random%_%random%.tmp" + + REM As the 'mklink' requires elevated permissions, the symbolic link + REM creation will fail if the script is not running as administrator. + mklink /d "!link_name!" . 1>nul 2>nul + if errorlevel 1 ( + echo. + echo Script requires administrator permissions, or a 7-zip version 20.x or older. + echo Current version is "%version_7z%" + exit /b 1 + ) else ( + REM Remove the temporary symbolic link. + rd "!link_name!" + ) +) + +REM Prerequisites: +REM +REM Visual Studio 2019, CMake, Ninja, GNUWin32, SWIG, Python 3, +REM NSIS with the strlen_8192 patch, +REM Perl (for the OpenMP run-time). +REM +REM +REM For LLDB, SWIG version 4.1.1 should be used. +REM + +:: Detect Visual Studio +set vsinstall= +set vswhere=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe + +if "%VSINSTALLDIR%" NEQ "" ( + echo using enabled Visual Studio installation + set "vsinstall=%VSINSTALLDIR%" +) else ( + echo using vswhere to detect Visual Studio installation + FOR /F "delims=" %%r IN ('^""%vswhere%" -nologo -latest -products "*" -all -property installationPath^"') DO set vsinstall=%%r +) +set "vsdevcmd=%vsinstall%\Common7\Tools\VsDevCmd.bat" + +if not exist "%vsdevcmd%" ( + echo Can't find any installation of Visual Studio + exit /b 1 +) +echo Using VS devcmd: %vsdevcmd% + +::============================================================================== +:: start echoing what we do +@echo on + +set python32_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310-32 +set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310 +set pythonarm64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python311-arm64 + +set revision=llvmorg-%version% +set package_version=%version% +set build_dir=%cd%\llvm_package_%package_version% + +echo Revision: %revision% +echo Package version: %package_version% +echo Build dir: %build_dir% +echo. + +if exist %build_dir% ( + echo Build directory already exists: %build_dir% + exit /b 1 +) +mkdir %build_dir% +cd %build_dir% || exit /b 1 + +if "%skip-checkout%" == "true" ( + echo Using local source + set llvm_src=%~dp0..\..\.. 
+) else (
+  echo Checking out %revision%
+  curl -L https://github.com/llvm/llvm-project/archive/%revision%.zip -o src.zip || exit /b 1
+  7z x src.zip || exit /b 1
+  mv llvm-project-* llvm-project || exit /b 1
+  set llvm_src=%build_dir%\llvm-project
+)
+
+curl -O https://gitlab.gnome.org/GNOME/libxml2/-/archive/v2.9.12/libxml2-v2.9.12.tar.gz || exit /b 1
+tar zxf libxml2-v2.9.12.tar.gz
+
+REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226.
+REM Common flags for all builds.
+set common_compiler_flags=-DLIBXML_STATIC
+set common_cmake_flags=^
+  -DCMAKE_BUILD_TYPE=Release ^
+  -DLLVM_ENABLE_ASSERTIONS=OFF ^
+  -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON ^
+  -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86" ^
+  -DLLVM_BUILD_LLVM_C_DYLIB=ON ^
+  -DCMAKE_INSTALL_UCRT_LIBRARIES=ON ^
+  -DPython3_FIND_REGISTRY=NEVER ^
+  -DPACKAGE_VERSION=%package_version% ^
+  -DLLDB_RELOCATABLE_PYTHON=1 ^
+  -DLLDB_EMBED_PYTHON_HOME=OFF ^
+  -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^
+  -DLLVM_ENABLE_LIBXML2=FORCE_ON ^
+  -DLLDB_ENABLE_LIBXML2=OFF ^
+  -DCLANG_ENABLE_LIBXML2=OFF ^
+  -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
+  -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^
+  -DLLVM_ENABLE_RPMALLOC=ON ^
+  -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp"
+
+set cmake_profile_flags=""
+
+REM Preserve the original PATH.
+set OLDPATH=%PATH%
+
+REM Build the 32-bit and/or 64-bit binaries.
+if "%x86%" == "true" call :do_build_32 || exit /b 1
+if "%x64%" == "true" call :do_build_64 || exit /b 1
+if "%arm64%" == "true" call :do_build_arm64 || exit /b 1
+exit /b 0
+
+::==============================================================================
+:: Build 32-bit binaries.
+::==============================================================================
+:do_build_32
+call :set_environment %python32_dir% || exit /b 1
+call "%vsdevcmd%" -arch=x86 || exit /b 1
+@echo on
+mkdir build32_stage0
+cd build32_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build32_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DLLVM_ENABLE_RPMALLOC=OFF ^
+  -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
+  -DPYTHON_HOME=%PYTHONHOME% ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+cd..
+
+REM CMake expects the paths that specify the compiler and linker to be
+REM written with forward slashes.
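+REM As a minimal illustration of the %var:str=repl% substitution used
+REM below (the path here is made up): after
+REM   set "p=C:\work\stage0\bin"
+REM the expansion %p:\=/% yields C:/work/stage0/bin. The same rewrite is
+REM applied to the whole flag string via %all_cmake_flags:\=/%.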
+set all_cmake_flags=^
+  %cmake_flags% ^
+  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
+  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
+  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
+set cmake_flags=%all_cmake_flags:\=/%
+
+mkdir build32
+cd build32
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja package || exit /b 1
+cd ..
+
+exit /b 0
+::==============================================================================
+
+::==============================================================================
+:: Build 64-bit binaries.
+::==============================================================================
+:do_build_64
+call :set_environment %python64_dir% || exit /b 1
+call "%vsdevcmd%" -arch=amd64 || exit /b 1
+@echo on
+mkdir build64_stage0
+cd build64_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build64_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
+  -DPYTHON_HOME=%PYTHONHOME% ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
+cd..
+
+REM CMake expects the paths that specify the compiler and linker to be
+REM written with forward slashes.
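+REM Stage 1 below rebuilds the toolchain with the stage 0 clang-cl,
+REM lld-link and llvm-lib, so the released binaries are produced by the
+REM just-built clang rather than by MSVC. The "ninja || ninja || ninja"
+REM pattern retries a flaky step up to three times, since || only runs
+REM its right-hand side when the left-hand command fails.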
+set all_cmake_flags=^
+  %cmake_flags% ^
+  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
+  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
+  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
+set cmake_flags=%all_cmake_flags:\=/%
+
+
+mkdir build64
+cd build64
+call :do_generate_profile || exit /b 1
+cmake -GNinja %cmake_flags% %cmake_profile_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
+ninja package || exit /b 1
+
+:: Generate a tarball with LLVM_INSTALL_TOOLCHAIN_ONLY off.
+set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
+cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^
+    -DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1
+ninja install || exit /b 1
+:: Check that llvm-config is present and returns something.
+%build_dir%/%filename%/bin/llvm-config.exe --bindir || exit /b 1
+cd ..
+7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz
+
+exit /b 0
+::==============================================================================
+
+::==============================================================================
+:: Build arm64 binaries.
+::==============================================================================
+:do_build_arm64
+call :set_environment %pythonarm64_dir% || exit /b 1
+call "%vsdevcmd%" -host_arch=x64 -arch=arm64 || exit /b 1
+@echo on
+mkdir build_arm64_stage0
+cd build_arm64_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DCLANG_DEFAULT_LINKER=lld ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DCOMPILER_RT_BUILD_PROFILE=OFF ^
+  -DCOMPILER_RT_BUILD_SANITIZERS=OFF
+
+REM We need to build stage0 compiler-rt with clang-cl (MSVC lacks some builtins).
+cmake -GNinja %cmake_flags% ^
+    -DCMAKE_C_COMPILER=clang-cl.exe ^
+    -DCMAKE_CXX_COMPILER=clang-cl.exe ^
+    %llvm_src%\llvm || exit /b 1
+ninja || exit /b 1
+::ninja check-llvm || exit /b 1
+::ninja check-clang || exit /b 1
+::ninja check-lld || exit /b 1
+::ninja check-sanitizer || exit /b 1
+::ninja check-clang-tools || exit /b 1
+::ninja check-clangd || exit /b 1
+cd..
+
+REM CMake expects the paths that specify the compiler and linker to be
+REM written with forward slashes.
+REM CPACK_SYSTEM_NAME is set so the generated installer gets the correct name.
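+REM CPack's default package file name is
+REM ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${CPACK_SYSTEM_NAME},
+REM so setting CPACK_SYSTEM_NAME=woa64 below labels the arm64 installer
+REM "woa64" (Windows on ARM64) instead of the build host's architecture.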
+set all_cmake_flags=^ + %cmake_flags% ^ + -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ + -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ + -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ + -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ + -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe ^ + -DCPACK_SYSTEM_NAME=woa64 +set cmake_flags=%all_cmake_flags:\=/% + +mkdir build_arm64 +cd build_arm64 +cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 +ninja || exit /b 1 +REM Check but do not fail on errors. +ninja check-lldb +::ninja check-llvm || exit /b 1 +::ninja check-clang || exit /b 1 +::ninja check-lld || exit /b 1 +::ninja check-sanitizer || exit /b 1 +::ninja check-clang-tools || exit /b 1 +::ninja check-clangd || exit /b 1 +ninja package || exit /b 1 +cd .. + +exit /b 0 +::============================================================================== +:: +::============================================================================== +:: Set PATH and some environment variables. +::============================================================================== +:set_environment +REM Restore original path +set PATH=%OLDPATH% + +set python_dir=%1 + +REM Set Python environment +if "%local-python%" == "true" ( + FOR /F "delims=" %%i IN ('where python.exe ^| head -1') DO set python_exe=%%i + set PYTHONHOME=!python_exe:~0,-11! +) else ( + %python_dir%/python.exe --version || exit /b 1 + set PYTHONHOME=%python_dir% +) +set PATH=%PYTHONHOME%;%PATH% + +set "VSCMD_START_DIR=%build_dir%" + +exit /b 0 + +::============================================================================= + +::============================================================================== +:: Build libxml. +::============================================================================== +:do_build_libxml +mkdir libxmlbuild +cd libxmlbuild +cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install ^ + -DBUILD_SHARED_LIBS=OFF -DLIBXML2_WITH_C14N=OFF -DLIBXML2_WITH_CATALOG=OFF ^ + -DLIBXML2_WITH_DEBUG=OFF -DLIBXML2_WITH_DOCB=OFF -DLIBXML2_WITH_FTP=OFF ^ + -DLIBXML2_WITH_HTML=OFF -DLIBXML2_WITH_HTTP=OFF -DLIBXML2_WITH_ICONV=OFF ^ + -DLIBXML2_WITH_ICU=OFF -DLIBXML2_WITH_ISO8859X=OFF -DLIBXML2_WITH_LEGACY=OFF ^ + -DLIBXML2_WITH_LZMA=OFF -DLIBXML2_WITH_MEM_DEBUG=OFF -DLIBXML2_WITH_MODULES=OFF ^ + -DLIBXML2_WITH_OUTPUT=ON -DLIBXML2_WITH_PATTERN=OFF -DLIBXML2_WITH_PROGRAMS=OFF ^ + -DLIBXML2_WITH_PUSH=OFF -DLIBXML2_WITH_PYTHON=OFF -DLIBXML2_WITH_READER=OFF ^ + -DLIBXML2_WITH_REGEXPS=OFF -DLIBXML2_WITH_RUN_DEBUG=OFF -DLIBXML2_WITH_SAX1=OFF ^ + -DLIBXML2_WITH_SCHEMAS=OFF -DLIBXML2_WITH_SCHEMATRON=OFF -DLIBXML2_WITH_TESTS=OFF ^ + -DLIBXML2_WITH_THREADS=ON -DLIBXML2_WITH_THREAD_ALLOC=OFF -DLIBXML2_WITH_TREE=ON ^ + -DLIBXML2_WITH_VALID=OFF -DLIBXML2_WITH_WRITER=OFF -DLIBXML2_WITH_XINCLUDE=OFF ^ + -DLIBXML2_WITH_XPATH=OFF -DLIBXML2_WITH_XPTR=OFF -DLIBXML2_WITH_ZLIB=OFF ^ + -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded ^ + ../../libxml2-v2.9.12 || exit /b 1 +ninja install || exit /b 1 +set libxmldir=%cd%\install +set "libxmldir=%libxmldir:\=/%" +cd .. +exit /b 0 + +::============================================================================== +:: Generate a PGO profile. +::============================================================================== +:do_generate_profile +REM Build Clang with instrumentation. 
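+REM The flow below: (1) build an IR-instrumented clang in instrument/,
+REM (2) use it in train/ to compile one large translation unit
+REM (Sema.cpp) as training input, (3) merge the resulting *.profraw
+REM files with llvm-profdata into profile.profdata, which the stage 1
+REM build consumes through -DLLVM_PROFDATA_FILE.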
+mkdir instrument
+cd instrument
+cmake -GNinja %cmake_flags% -DLLVM_TARGETS_TO_BUILD=Native ^
+    -DLLVM_BUILD_INSTRUMENTED=IR %llvm_src%\llvm || exit /b 1
+ninja clang || ninja clang || ninja clang || exit /b 1
+set instrumented_clang=%cd:\=/%/bin/clang-cl.exe
+cd ..
+REM Use that to build part of llvm to generate a profile.
+mkdir train
+cd train
+cmake -GNinja %cmake_flags% ^
+    -DCMAKE_C_COMPILER=%instrumented_clang% ^
+    -DCMAKE_CXX_COMPILER=%instrumented_clang% ^
+    -DLLVM_ENABLE_PROJECTS=clang ^
+    -DLLVM_TARGETS_TO_BUILD=Native ^
+    %llvm_src%\llvm || exit /b 1
+REM Drop profiles generated from running cmake; those are not representative.
+del ..\instrument\profiles\*.profraw
+ninja tools/clang/lib/Sema/CMakeFiles/obj.clangSema.dir/Sema.cpp.obj
+cd ..
+set profile=%cd:\=/%/profile.profdata
+%stage0_bin_dir%\llvm-profdata merge -output=%profile% instrument\profiles\*.profraw || exit /b 1
+set common_compiler_flags=%common_compiler_flags% -Wno-backend-plugin
+set cmake_profile_flags=-DLLVM_PROFDATA_FILE=%profile% ^
+    -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
+    -DCMAKE_CXX_FLAGS="%common_compiler_flags%"
+exit /b 0
+
+::=============================================================================
+:: Parse command line arguments.
+:: The format for the arguments is:
+::   Boolean: --option
+::   Value: --option<separator>value
+::     with <separator> being: space, colon, semicolon or equal sign
+::
+:: Command line usage example:
+::   my-batch-file.bat --build --type=release --version 123
+:: It will create 3 variables:
+::   'build' with the value 'true'
+::   'type' with the value 'release'
+::   'version' with the value '123'
+::
+:: Usage:
+::   set "build="
+::   set "type="
+::   set "version="
+::
+::   REM Parse arguments.
+::   call :parse_args %*
+::
+::   if defined build (
+::     ...
+::   )
+::   if %type%=='release' (
+::     ...
+::   )
+::   if %version%=='123' (
+::     ...
+::   )
+::=============================================================================
+:parse_args
+  set "arg_name="
+  :parse_args_start
+  if "%1" == "" (
+    :: Set a seen boolean argument.
+    if "%arg_name%" neq "" (
+      set "%arg_name%=true"
+    )
+    goto :parse_args_done
+  )
+  set aux=%1
+  if "%aux:~0,2%" == "--" (
+    :: Set a seen boolean argument.
+    if "%arg_name%" neq "" (
+      set "%arg_name%=true"
+    )
+    set "arg_name=%aux:~2,250%"
+  ) else (
+    set "%arg_name%=%1"
+    set "arg_name="
+  )
+  shift
+  goto :parse_args_start
+
+:parse_args_done
+exit /b 0
diff --git a/openmp/runtime/doc/doxygen/config b/openmp/runtime/doc/doxygen/config
index 8d79dc143cc1..04c966766ba6 100644
--- a/openmp/runtime/doc/doxygen/config
+++ b/openmp/runtime/doc/doxygen/config
@@ -1,1822 +1,1822 @@
-# Doxyfile 1.8.2
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (" ").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all
-# text before the first occurrence of this tag. Doxygen uses libiconv (or the
-# iconv built into libc) for the transcoding.
See -# http://www.gnu.org/software/libiconv for the list of possible encodings. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or sequence of words) that should -# identify the project. Note that if you do not use Doxywizard you need -# to put quotes around the project name if it contains spaces. - -PROJECT_NAME = "LLVM OpenMP* Runtime Library" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. -# This could be handy for archiving the generated documentation or -# if some version control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer -# a quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify an logo or icon that is -# included in the documentation. The maximum height of the logo should not -# exceed 55 pixels and the maximum width should not exceed 200 pixels. -# Doxygen will copy the logo to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) -# base path where the generated documentation will be put. -# If a relative path is entered, it will be relative to the location -# where doxygen was started. If left blank the current directory will be used. - -OUTPUT_DIRECTORY = doc/doxygen/generated - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create -# 4096 sub-directories (in 2 levels) under the output directory of each output -# format and will distribute the generated files over these directories. -# Enabling this option can be useful when feeding doxygen a huge amount of -# source files, where putting all generated files in the same directory would -# otherwise cause performance problems for the file system. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# The default language is English, other supported languages are: -# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, -# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, -# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English -# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, -# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, -# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will -# include brief member descriptions after the members that are listed in -# the file and class documentation (similar to JavaDoc). -# Set to NO to disable this. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend -# the brief description of a member or function before the detailed description. -# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator -# that is used to form the text in various listings. 
Each string -# in this list, if found as the leading text of the brief description, will be -# stripped from the text and the result after processing the whole list, is -# used as the annotated text. Otherwise, the brief description is used as-is. -# If left blank, the following values are used ("$name" is automatically -# replaced with the name of the entity): "The $name class" "The $name widget" -# "The $name file" "is" "provides" "specifies" "contains" -# "represents" "a" "an" "the" - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# Doxygen will generate a detailed section even if there is only a brief -# description. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full -# path before files name in the file list and in the header files. If set -# to NO the shortest path that makes the file name unique will be used. - -FULL_PATH_NAMES = NO - -# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag -# can be used to strip a user-defined part of the path. Stripping is -# only done if one of the specified strings matches the left-hand part of -# the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the -# path to strip. Note that you specify absolute paths here, but also -# relative paths, which will be relative from the directory where doxygen is -# started. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of -# the path mentioned in the documentation of a class, which tells -# the reader which header file to include in order to use a class. -# If left blank only the name of the header file containing the class -# definition is used. Otherwise one should specify the include paths that -# are normally passed to the compiler using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter -# (but less readable) file names. This can be useful if your file system -# doesn't support long names like on DOS, Mac, or CD-ROM. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen -# will interpret the first line (until the first dot) of a JavaDoc-style -# comment as the brief description. If set to NO, the JavaDoc -# comments will behave just like regular Qt-style comments -# (thus requiring an explicit @brief command for a brief description.) - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then Doxygen will -# interpret the first line (until the first dot) of a Qt-style -# comment as the brief description. If set to NO, the comments -# will behave just like regular Qt-style comments (thus requiring -# an explicit \brief command for a brief description.) - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen -# treat a multi-line C++ special comment block (i.e. a block of //! or /// -# comments) as a brief description. This used to be the default behaviour. -# The new default is to treat a multi-line C++ comment block as a detailed -# description. 
Set this tag to YES if you prefer the old behaviour instead. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented -# member inherits the documentation from any documented member that it -# re-implements. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce -# a new page for each member. If set to NO, the documentation of a member will -# be part of the file/class/namespace that contains it. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. -# Doxygen uses this value to replace tabs by spaces in code fragments. - -TAB_SIZE = 8 - -# This tag can be used to specify a number of aliases that acts -# as commands in the documentation. An alias has the form "name=value". -# For example adding "sideeffect=\par Side Effects:\n" will allow you to -# put the command \sideeffect (or @sideeffect) in the documentation, which -# will result in a user-defined paragraph with heading "Side Effects:". -# You can put \n's in the value part of an alias to insert newlines. - -ALIASES = "other=*" - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding -# "class=itcl::class" will allow you to use the command class in the -# itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C -# sources only. Doxygen will then generate output that is more tailored for C. -# For instance, some of the names that are used will be different. The list -# of all members will be omitted, etc. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java -# sources only. Doxygen will then generate output that is more tailored for -# Java. For instance, namespaces will be presented as packages, qualified -# scopes will look different, etc. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources only. Doxygen will then generate output that is more tailored for -# Fortran. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for -# VHDL. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, -# and language is one of the parsers supported by doxygen: IDL, Java, -# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, -# C++. For instance to make doxygen treat .inc files as Fortran files (default -# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note -# that for custom extensions you also need to set FILE_PATTERNS otherwise the -# files are not read by doxygen. - -EXTENSION_MAPPING = - -# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all -# comments according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you -# can mix doxygen, HTML, and XML commands with Markdown formatting. 
-# Disable only in case of backward compatibilities issues. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented classes, -# or namespaces to their corresponding documentation. Such a link can be -# prevented in individual cases by by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should -# set this tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. -# func(std::string) {}). This also makes the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. -# Doxygen will parse them like normal C++ but will assume all classes use public -# instead of private inheritance when no explicit protection keyword is present. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to -# indicate getter and setter methods for a property. Setting this -# option to YES (the default) will make doxygen replace the get and -# set methods by a property in the documentation. This will only work -# if the methods are indeed getting or setting a simple type. If this -# is not the case, or you want to show the methods anyway, you should -# set this option to NO. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES (the default) to allow class member groups of -# the same type (for instance a group of public functions) to be put as a -# subgroup of that type (e.g. under the Public Functions section). Set it to -# NO to prevent subgrouping. Alternatively, this can be done per class using -# the \nosubgrouping command. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and -# unions are shown inside the group in which they are included (e.g. using -# @ingroup) instead of on a separate page (for HTML and Man pages) or -# section (for LaTeX and RTF). - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and -# unions with only public data fields will be shown inline in the documentation -# of the scope in which they are defined (i.e. file, namespace, or group -# documentation), provided this scope is documented. If set to NO (the default), -# structs, classes, and unions are shown on a separate page (for HTML and Man -# pages) or section (for LaTeX and RTF). - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum -# is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. 
And the struct will be named TypeS. This can typically -# be useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. - -TYPEDEF_HIDES_STRUCT = NO - -# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to -# determine which symbols to keep in memory and which to flush to disk. -# When the cache is full, less often used symbols will be written to disk. -# For small to medium size projects (<1000 input files) the default value is -# probably good enough. For larger projects a too small cache size can cause -# doxygen to be busy swapping symbols to and from disk most of the time -# causing a significant performance penalty. -# If the system has enough physical memory increasing the cache will improve the -# performance by keeping more symbols in memory. Note that the value works on -# a logarithmic scale so increasing the size by one will roughly double the -# memory usage. The cache size is given by this formula: -# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols. - -SYMBOL_CACHE_SIZE = 0 - -# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be -# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given -# their name and scope. Since this can be an expensive process and often the -# same symbol appear multiple times in the code, doxygen keeps a cache of -# pre-resolved symbols. If the cache is too small doxygen will become slower. -# If the cache is too large, memory is wasted. The cache size is given by this -# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. -# Private class members and static file members will be hidden unless -# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES - -EXTRACT_ALL = NO - -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class -# will be included in the documentation. - -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal -# scope will be included in the documentation. - -EXTRACT_PACKAGE = NO - -# If the EXTRACT_STATIC tag is set to YES all static members of a file -# will be included in the documentation. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) -# defined locally in source files will be included in the documentation. -# If set to NO only classes defined in header files are included. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. When set to YES local -# methods, which are defined in the implementation section but not in -# the interface are included in the documentation. -# If set to NO (the default) only methods in the interface are included. 
- -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base -# name of the file that contains the anonymous namespace. By default -# anonymous namespaces are hidden. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all -# undocumented members of documented classes, files or namespaces. -# If set to NO (the default) these members will be included in the -# various overviews, but no documentation section is generated. -# This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_MEMBERS = YES - -# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. -# If set to NO (the default) these classes will be included in the various -# overviews. This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_CLASSES = YES - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all -# friend (class|struct|union) declarations. -# If set to NO (the default) these declarations will be included in the -# documentation. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any -# documentation blocks found inside the body of a function. -# If set to NO (the default) these blocks will be appended to the -# function's detailed documentation block. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation -# that is typed after a \internal command is included. If the tag is set -# to NO (the default) then the documentation will be excluded. -# Set it to YES to include the internal documentation. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate -# file names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. - -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen -# will show members with their full class and namespace scopes in the -# documentation. If set to YES the scope will be hidden. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen -# will put a list of the files that are included by a file in the documentation -# of that file. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen -# will list include files with double quotes in the documentation -# rather than with sharp brackets. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] -# is inserted in the documentation for inline members. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen -# will sort the (detailed) documentation of file and class members -# alphabetically by member name. If set to NO the members will appear in -# declaration order. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the -# brief documentation of file, namespace and class members alphabetically -# by member name. If set to NO (the default) the members will appear in -# declaration order. 
- -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen -# will sort the (brief and detailed) documentation of class members so that -# constructors and destructors are listed first. If set to NO (the default) -# the constructors will appear in the respective orders defined by -# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. -# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO -# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the -# hierarchy of group names into alphabetical order. If set to NO (the default) -# the group names will appear in their defined order. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be -# sorted by fully-qualified names, including namespaces. If set to -# NO (the default), the class list will be sorted only by class name, -# not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the -# alphabetical list. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to -# do proper type resolution of all parameters of a function it will reject a -# match between the prototype and the implementation of a member function even -# if there is only one candidate or it is obvious which candidate to choose -# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen -# will still accept a match between prototype and implementation in such cases. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or -# disable (NO) the todo list. This list is created by putting \todo -# commands in the documentation. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or -# disable (NO) the test list. This list is created by putting \test -# commands in the documentation. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or -# disable (NO) the bug list. This list is created by putting \bug -# commands in the documentation. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or -# disable (NO) the deprecated list. This list is created by putting -# \deprecated commands in the documentation. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional -# documentation sections, marked by \if sectionname ... \endif. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines -# the initial value of a variable or macro consists of for it to appear in -# the documentation. If the initializer consists of more lines than specified -# here it will be hidden. Use a value of 0 to hide initializers completely. -# The appearance of the initializer of individual variables and macros in the -# documentation can be controlled using \showinitializer or \hideinitializer -# command in the documentation regardless of this setting. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated -# at the bottom of the documentation of classes and structs. If set to YES the -# list will mention the files that were used to generate the documentation. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
-# This will remove the Files entry from the Quick Index and from the
-# Folder Tree View (if specified). The default is YES.
-
-# We probably will want this, but we have no file documentation yet so it's simpler to remove
-# it for now.
-SHOW_FILES = NO
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
-# Namespaces page.
-# This will remove the Namespaces entry from the Quick Index
-# and from the Folder Tree View (if specified). The default is YES.
-
-SHOW_NAMESPACES = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output
-# is used as the file version. See the manual for examples.
-
-FILE_VERSION_FILTER =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option.
-# You can optionally specify a file name after the option, if omitted
-# DoxygenLayout.xml will be used as the name of the layout file.
-
-LAYOUT_FILE =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files
-# containing the references data. This must be a list of .bib files. The
-# .bib extension is automatically appended if omitted. Using this command
-# requires the bibtex tool to be installed. See also
-# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
-# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
-# feature you need bibtex and perl available in the search path.
-
-CITE_BIB_FILES =
-
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
-
-QUIET = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated by doxygen. Possible values are YES and NO. If left blank
-# NO is used.
-
-WARNINGS = YES
-
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
-# automatically be disabled.
-
-WARN_IF_UNDOCUMENTED = YES
-
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some
-# parameters in a documented function, or documenting parameters that
-# don't exist or using markup commands wrongly.
-
-WARN_IF_DOC_ERROR = YES
-
-# The WARN_NO_PARAMDOC option can be enabled to get warnings for
-# functions that are documented, but have no documentation for their parameters
-# or return value. If set to NO (the default) doxygen will only warn about
-# wrong or incomplete parameter documentation, but not about the absence of
-# documentation.
-
-WARN_NO_PARAMDOC = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that
-# doxygen can produce.
The string should contain the $file, $line, and $text -# tags, which will be replaced by the file and line number from which the -# warning originated and the warning text. Optionally the format may contain -# $version, which will be replaced by the version of the file (if it could -# be obtained via FILE_VERSION_FILTER) - -WARN_FORMAT = - -# The WARN_LOGFILE tag can be used to specify a file to which warning -# and error messages should be written. If left blank the output is written -# to stderr. - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag can be used to specify the files and/or directories that contain -# documented source files. You may enter file names like "myfile.cpp" or -# directories like "/usr/src/myproject". Separate the files or directories -# with spaces. - -INPUT = src doc/doxygen/libomp_interface.h -# The ittnotify code also has doxygen documentation, but if we include it here -# it takes over from us! -# src/thirdparty/ittnotify - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is -# also the default input encoding. Doxygen uses libiconv (or the iconv built -# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for -# the list of possible encodings. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank the following patterns are tested: -# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh -# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py -# *.f90 *.f *.for *.vhd *.vhdl - -FILE_PATTERNS = *.c *.h *.cpp -# We may also want to include the asm files with appropriate ifdef to ensure -# doxygen doesn't see the content, just the documentation... - -# The RECURSIVE tag can be used to turn specify whether or not subdirectories -# should be searched for input files as well. Possible values are YES and NO. -# If left blank NO is used. - -# Only look in the one directory. -RECURSIVE = NO - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = src/test-touch.c - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. Note that the wildcards are matched -# against the file with absolute path, so to exclude all test directories -# for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. 
The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-
-EXCLUDE_SYMBOLS =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or
-# directories that contain example code fragments that are included (see
-# the \include command).
-
-EXAMPLE_PATH =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank all files are included.
-
-EXAMPLE_PATTERNS =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude
-# commands irrespective of the value of the RECURSIVE tag.
-# Possible values are YES and NO. If left blank NO is used.
-
-EXAMPLE_RECURSIVE = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or
-# directories that contain image that are included in the documentation (see
-# the \image command).
-
-IMAGE_PATH =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command <filter> <input-file>, where <filter>
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
-# input file. Doxygen will then use the output that the filter program writes
-# to standard output.
-# If FILTER_PATTERNS is specified, this tag will be
-# ignored.
-
-INPUT_FILTER =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis.
-# Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match.
-# The filters are a list of the form:
-# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
-# info on how filters are used. If FILTER_PATTERNS is empty or if
-# non of the patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will be used to filter the input files when producing source
-# files to browse (i.e. when SOURCE_BROWSER is set to YES).
-
-FILTER_SOURCE_FILES = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
-# and it is also possible to disable source filtering for a specific pattern
-# using *.ext= (so without naming a filter). This option only has effect when
-# FILTER_SOURCE_FILES is enabled.
-
-FILTER_SOURCE_PATTERNS =
-
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will
-# be generated. Documented entities will be cross-referenced with these sources.
-# Note: To get rid of all source code in the generated output, make sure also
-# VERBATIM_HEADERS is set to NO.
-
-SOURCE_BROWSER = YES
-
-# Setting the INLINE_SOURCES tag to YES will include the body
-# of functions and classes directly in the documentation.
-
-INLINE_SOURCES = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
-# doxygen to hide any special comment blocks from generated source code
-# fragments.
Normal C, C++ and Fortran comments will always remain visible. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES -# then for each documented function all documented -# functions referencing it will be listed. - -REFERENCED_BY_RELATION = YES - -# If the REFERENCES_RELATION tag is set to YES -# then for each documented function all documented entities -# called/used by that function will be listed. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) -# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from -# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will -# link to the source code. -# Otherwise they will link to the documentation. - -REFERENCES_LINK_SOURCE = YES - -# If the USE_HTAGS tag is set to YES then the references to source code -# will point to the HTML generated by the htags(1) tool instead of doxygen -# built-in source browser. The htags tool is part of GNU's global source -# tagging system (see http://www.gnu.org/software/global/global.html). You -# will need version 4.8.6 or higher. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen -# will generate a verbatim copy of the header file for each class for -# which an include is specified. Set to NO to disable this. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index -# of all compounds will be generated. Enable this if the project -# contains a lot of classes, structs, unions or interfaces. - -ALPHABETICAL_INDEX = YES - -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all -# classes will be put under the same header in the alphabetical index. -# The IGNORE_PREFIX tag can be used to specify one or more prefixes that -# should be ignored while generating the index headers. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES (the default) Doxygen will -# generate HTML output. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `html' will be used as the default path. - -HTML_OUTPUT = - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for -# each generated HTML page (for example: .htm,.php,.asp). If it is left blank -# doxygen will generate files with .html extension. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a personal HTML header for -# each generated HTML page. If it is left blank doxygen will generate a -# standard header. Note that when using a custom header you are responsible -# for the proper inclusion of any scripts and style sheets that doxygen -# needs, which is dependent on the configuration options used. 
-# It is advised to generate a default header using "doxygen -w html -# header.html footer.html stylesheet.css YourConfigFile" and then modify -# that header. Note that the header is subject to change so you typically -# have to redo this when upgrading to a newer version of doxygen or when -# changing the value of configuration settings such as GENERATE_TREEVIEW! - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a personal HTML footer for -# each generated HTML page. If it is left blank doxygen will generate a -# standard footer. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading -# style sheet that is used by each HTML page. It can be used to -# fine-tune the look of the HTML output. If left blank doxygen will -# generate a default style sheet. Note that it is recommended to use -# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this -# tag will in the future become obsolete. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional -# user-defined cascading style sheet that is included after the standard -# style sheets created by doxygen. Using this option one can overrule -# certain style aspects. This is preferred over using HTML_STYLESHEET -# since it does not replace the standard style sheet and is therefor more -# robust against future updates. Doxygen will copy the style sheet file to -# the output directory. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that -# the files will be copied as-is; there are no commands or markers available. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. -# Doxygen will adjust the colors in the style sheet and background images -# according to this color. Hue is specified as an angle on a colorwheel, -# see http://en.wikipedia.org/wiki/Hue for more information. -# For instance the value 0 represents red, 60 is yellow, 120 is green, -# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. -# The allowed range is 0 to 359. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of -# the colors in the HTML output. For a value of 0 the output will use -# grayscales only. A value of 255 will produce the most vivid colors. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to -# the luminance component of the colors in the HTML output. Values below -# 100 gradually make the output lighter, whereas values above 100 make -# the output darker. The value divided by 100 is the actual gamma applied, -# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, -# and 100 does not change the gamma. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting -# this to NO can help when comparing the output of multiple runs. 
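Spelled out on one line, the header-regeneration command recommended above is (YourConfigFile stands for the actual name of this Doxyfile):

    doxygen -w html header.html footer.html stylesheet.css YourConfigFile

The generated header.html can then be edited and wired in via the HTML_HEADER tag above; as the comment notes, this step has to be redone after doxygen upgrades or changes to options such as GENERATE_TREEVIEW.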
- -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of -# entries shown in the various tree structured indices initially; the user -# can expand and collapse entries dynamically later on. Doxygen will expand -# the tree to such a level that at most the specified number of entries are -# visible (unless a fully collapsed tree already exceeds this amount). -# So setting the number of entries 1 will produce a full collapsed tree by -# default. 0 is a special value representing an infinite number of entries -# and will result in a full expanded tree by default. - -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files -# will be generated that can be used as input for Apple's Xcode 3 -# integrated development environment, introduced with OSX 10.5 (Leopard). -# To create a documentation set, doxygen will generate a Makefile in the -# HTML output directory. Running make will produce the docset in that -# directory and running "make install" will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find -# it at startup. -# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. - -GENERATE_DOCSET = NO - -# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the -# feed. A documentation feed provides an umbrella under which multiple -# documentation sets from a single provider (such as a company or product suite) -# can be grouped. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that -# should uniquely identify the documentation set bundle. This should be a -# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen -# will append .docset to the name. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely -# identify the documentation publisher. This should be a reverse domain-name -# style string, e.g. com.mycompany.MyDocSet.documentation. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES, additional index files -# will be generated that can be used as input for tools like the -# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) -# of the generated HTML documentation. - -GENERATE_HTMLHELP = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can -# be used to specify the file name of the resulting .chm file. You -# can add a path in front of the file if the result should not be -# written to the html output directory. - -CHM_FILE = - -# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can -# be used to specify the location (absolute path including file name) of -# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run -# the HTML help compiler on the generated index.hhp. - -HHC_LOCATION = - -# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag -# controls if a separate .chi index file is generated (YES) or that -# it should be included in the main .chm file (NO). 
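The GENERATE_DOCSET workflow described above boils down to two make invocations from the HTML output directory; the path below assumes this file's OUTPUT_DIRECTORY of doc/doxygen/generated:

    cd doc/doxygen/generated/html
    make            # build the .docset bundle in this directory
    make install    # copy it into ~/Library/Developer/Shared/Documentation/DocSets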
-
-GENERATE_CHI = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
-# is used to encode HtmlHelp index (hhk), content (hhc) and project file
-# content.
-
-CHM_INDEX_ENCODING =
-
-# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
-# controls whether a binary table of contents is generated (YES) or a
-# normal table of contents (NO) in the .chm file.
-
-BINARY_TOC = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members
-# to the contents of the HTML help documentation and to the tree view.
-
-TOC_EXPAND = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
-# that can be used as input for Qt's qhelpgenerator to generate a
-# Qt Compressed Help (.qch) of the generated HTML documentation.
-
-GENERATE_QHP = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
-# be used to specify the file name of the resulting .qch file.
-# The path specified is relative to the HTML output folder.
-
-QCH_FILE =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating
-# Qt Help Project output. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#namespace
-
-QHP_NAMESPACE = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
-# Qt Help Project output. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#virtual-folders
-
-QHP_VIRTUAL_FOLDER = doc
-
-# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
-# add. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#custom-filters
-
-QHP_CUST_FILTER_NAME =
-
-# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#custom-filters
-# (Qt Help Project / Custom Filters).
-
-QHP_CUST_FILTER_ATTRS =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's
-# filter section matches.
-# http://doc.trolltech.com/qthelpproject.html#filter-attributes
-# (Qt Help Project / Filter Attributes).
-
-QHP_SECT_FILTER_ATTRS =
-
-# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
-# be used to specify the location of Qt's qhelpgenerator.
-# If non-empty doxygen will try to run qhelpgenerator on the generated
-# .qhp file.
-
-QHG_LOCATION =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
-# will be generated, which together with the HTML files, form an Eclipse help
-# plugin. To install this plugin and make it available under the help contents
-# menu in Eclipse, the contents of the directory containing the HTML and XML
-# files needs to be copied into the plugins directory of eclipse. The name of
-# the directory within the plugins directory should be the same as
-# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
-# the help appears.
-
-GENERATE_ECLIPSEHELP = NO
-
-# A unique identifier for the eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have
-# this name.
-
-ECLIPSE_DOC_ID = org.doxygen.Project
-
-# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
-# at top of each HTML page. The value NO (the default) enables the index and
-# the value YES disables it. Since the tabs have the same information as the
-# navigation tree you can set this option to NO if you already set
-# GENERATE_TREEVIEW to YES.
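A minimal working combination of the Qt help tags above might look as follows; the namespace and tool location are illustrative values only, not taken from this file:

    GENERATE_QHP       = YES
    QHP_NAMESPACE      = org.example.Project
    QHP_VIRTUAL_FOLDER = doc
    QHG_LOCATION       = /usr/bin/qhelpgenerator

With QHG_LOCATION non-empty, doxygen runs qhelpgenerator on the generated .qhp file to produce the .qch archive.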
- -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. -# If the tag value is set to YES, a side panel will be generated -# containing a tree-like index structure (just like the one that -# is generated for HTML Help). For this to work a browser that supports -# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). -# Windows users are probably better off using the HTML help feature. -# Since the tree basically has the same information as the tab index you -# could consider to set DISABLE_INDEX to NO when enabling this option. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values -# (range [0,1..20]) that doxygen will group on one line in the generated HTML -# documentation. Note that a value of 0 will completely suppress the enum -# values from appearing in the overview section. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be -# used to set the initial width (in pixels) of the frame in which the tree -# is shown. - -TREEVIEW_WIDTH = 250 - -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open -# links to external symbols imported via tag files in a separate window. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of Latex formulas included -# as images in the HTML documentation. The default is 10. Note that -# when you change the font size after a successful doxygen run you need -# to manually remove any form_*.png images from the HTML output directory -# to force them to be regenerated. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are -# not supported properly for IE 6.0, but are supported on all modern browsers. -# Note that when changing this option you need to delete any form_*.png files -# in the HTML output before the changes have effect. - -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax -# (see http://www.mathjax.org) which uses client side Javascript for the -# rendering instead of using prerendered bitmaps. Use this if you do not -# have LaTeX installed or if you want to formulas look prettier in the HTML -# output. When enabled you may also need to install MathJax separately and -# configure the path to it using the MATHJAX_RELPATH option. - -USE_MATHJAX = NO - -# When MathJax is enabled you need to specify the location relative to the -# HTML output directory using the MATHJAX_RELPATH option. The destination -# directory should contain the MathJax.js script. For instance, if the mathjax -# directory is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to -# the MathJax Content Delivery Network so you can quickly see the result without -# installing MathJax. -# However, it is strongly recommended to install a local -# copy of MathJax from http://www.mathjax.org before deployment. - -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest - -# The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension -# names that should be enabled during MathJax rendering. - -MATHJAX_EXTENSIONS = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box -# for the HTML output. 
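As a concrete reading of the MATHJAX_RELPATH rule above: a local mathjax directory placed at the same level as the HTML output directory would be wired up as

    USE_MATHJAX     = YES
    MATHJAX_RELPATH = ../mathjax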
-# The underlying search engine uses javascript
-# and DHTML and should work on any modern browser. Note that when using
-# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
-# (GENERATE_DOCSET) there is already a search function so this one should
-# typically be disabled. For large projects the javascript based search engine
-# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
-
-SEARCHENGINE = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a PHP enabled web server instead of at the web client
-# using Javascript. Doxygen will generate the search PHP script and index
-# file to put on the web server. The advantage of the server
-# based approach is that it scales better to large projects and allows
-# full text search. The disadvantages are that it is more difficult to setup
-# and does not have live searching capabilities.
-
-SERVER_BASED_SEARCH = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
-# generate Latex output.
-
-GENERATE_LATEX = YES
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `latex' will be used as the default path.
-
-LATEX_OUTPUT =
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked. If left blank `latex' will be used as the default command name.
-# Note that when enabling USE_PDFLATEX this option is only used for
-# generating bitmaps for formulas in the HTML output, but not in the
-# Makefile that is written to the output directory.
-
-LATEX_CMD_NAME = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
-# generate index for LaTeX. If left blank `makeindex' will be used as the
-# default command name.
-
-MAKEINDEX_CMD_NAME = makeindex
-
-# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
-# LaTeX documents. This may be useful for small projects and may help to
-# save some trees in general.
-
-COMPACT_LATEX = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used
-# by the printer. Possible values are: a4, letter, legal and
-# executive. If left blank a4wide will be used.
-
-PAPER_TYPE = a4wide
-
-# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
-# packages that should be included in the LaTeX output.
-
-EXTRA_PACKAGES =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
-# the generated latex document. The header should contain everything until
-# the first chapter. If it is left blank doxygen will generate a
-# standard header. Notice: only use this tag if you know what you are doing!
-
-LATEX_HEADER = doc/doxygen/header.tex
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
-# the generated latex document. The footer should contain everything after
-# the last chapter. If it is left blank doxygen will generate a
-# standard footer. Notice: only use this tag if you know what you are doing!
-
-LATEX_FOOTER =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
-# is prepared for conversion to pdf (using ps2pdf).
The pdf file will -# contain links (just like the HTML output) instead of page references -# This makes the output suitable for online browsing using a pdf viewer. - -PDF_HYPERLINKS = YES - -# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of -# plain latex in the generated Makefile. Set this option to YES to get a -# higher quality PDF documentation. - -USE_PDFLATEX = YES - -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. -# command to the generated LaTeX files. This will instruct LaTeX to keep -# running if errors occur, instead of asking the user for help. -# This option is also used when generating formulas in HTML. - -LATEX_BATCHMODE = NO - -# If LATEX_HIDE_INDICES is set to YES then doxygen will not -# include the index chapters (such as File Index, Compound Index, etc.) -# in the output. - -LATEX_HIDE_INDICES = NO - -# If LATEX_SOURCE_CODE is set to YES then doxygen will include -# source code with syntax highlighting in the LaTeX output. -# Note that which sources are shown also depends on other settings -# such as SOURCE_BROWSER. - -LATEX_SOURCE_CODE = NO - -# The LATEX_BIB_STYLE tag can be used to specify the style to use for the -# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See -# http://en.wikipedia.org/wiki/BibTeX for more info. - -LATEX_BIB_STYLE = plain - -#--------------------------------------------------------------------------- -# configuration options related to the RTF output -#--------------------------------------------------------------------------- - -# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output -# The RTF output is optimized for Word 97 and may not look very pretty with -# other RTF readers or editors. - -GENERATE_RTF = NO - -# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `rtf' will be used as the default path. - -RTF_OUTPUT = - -# If the COMPACT_RTF tag is set to YES Doxygen generates more compact -# RTF documents. This may be useful for small projects and may help to -# save some trees in general. - -COMPACT_RTF = NO - -# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated -# will contain hyperlink fields. The RTF file will -# contain links (just like the HTML output) instead of page references. -# This makes the output suitable for online browsing using WORD or other -# programs which support those fields. -# Note: wordpad (write) and others do not support links. - -RTF_HYPERLINKS = NO - -# Load style sheet definitions from file. Syntax is similar to doxygen's -# config file, i.e. a series of assignments. You only have to provide -# replacements, missing definitions are set to their default value. - -RTF_STYLESHEET_FILE = - -# Set optional variables used in the generation of an rtf document. -# Syntax is similar to doxygen's config file. - -RTF_EXTENSIONS_FILE = - -#--------------------------------------------------------------------------- -# configuration options related to the man page output -#--------------------------------------------------------------------------- - -# If the GENERATE_MAN tag is set to YES (the default) Doxygen will -# generate man pages - -GENERATE_MAN = NO - -# The MAN_OUTPUT tag is used to specify where the man pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `man' will be used as the default path. 
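For reference, the LaTeX tags earlier in this section combine into the usual high-quality, hyperlinked PDF pipeline; the values below match this file except LATEX_BATCHMODE, which one might flip to YES for unattended builds so LaTeX keeps running past errors:

    GENERATE_LATEX  = YES
    USE_PDFLATEX    = YES
    PDF_HYPERLINKS  = YES
    LATEX_BATCHMODE = YES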
- -MAN_OUTPUT = - -# The MAN_EXTENSION tag determines the extension that is added to -# the generated man pages (default is the subroutine's section .3) - -MAN_EXTENSION = - -# If the MAN_LINKS tag is set to YES and Doxygen generates man output, -# then it will generate one additional man file for each entity -# documented in the real man page(s). These additional files -# only source the real man page, but without them the man command -# would be unable to find the correct page. The default is NO. - -MAN_LINKS = NO - -#--------------------------------------------------------------------------- -# configuration options related to the XML output -#--------------------------------------------------------------------------- - -# If the GENERATE_XML tag is set to YES Doxygen will -# generate an XML file that captures the structure of -# the code including all documentation. - -GENERATE_XML = NO - -# The XML_OUTPUT tag is used to specify where the XML pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `xml' will be used as the default path. - -XML_OUTPUT = xml - -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - -# If the XML_PROGRAMLISTING tag is set to YES Doxygen will -# dump the program listings (including syntax highlighting -# and cross-referencing information) to the XML output. Note that -# enabling this will significantly increase the size of the XML output. - -XML_PROGRAMLISTING = YES - -#--------------------------------------------------------------------------- -# configuration options for the AutoGen Definitions output -#--------------------------------------------------------------------------- - -# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will -# generate an AutoGen Definitions (see autogen.sf.net) file -# that captures the structure of the code including all -# documentation. Note that this feature is still experimental -# and incomplete at the moment. - -GENERATE_AUTOGEN_DEF = NO - -#--------------------------------------------------------------------------- -# configuration options related to the Perl module output -#--------------------------------------------------------------------------- - -# If the GENERATE_PERLMOD tag is set to YES Doxygen will -# generate a Perl module file that captures the structure of -# the code including all documentation. Note that this -# feature is still experimental and incomplete at the -# moment. - -GENERATE_PERLMOD = NO - -# If the PERLMOD_LATEX tag is set to YES Doxygen will generate -# the necessary Makefile rules, Perl scripts and LaTeX code to be able -# to generate PDF and DVI output from the Perl module output. - -PERLMOD_LATEX = NO - -# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be -# nicely formatted so it can be parsed by a human reader. -# This is useful -# if you want to understand what is going on. -# On the other hand, if this -# tag is set to NO the size of the Perl module output will be much smaller -# and Perl will parse it just the same. - -PERLMOD_PRETTY = YES - -# The names of the make variables in the generated doxyrules.make file -# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
-# This is useful so different doxyrules.make files included by the same -# Makefile don't overwrite each other's variables. - -PERLMOD_MAKEVAR_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the preprocessor -#--------------------------------------------------------------------------- - -# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will -# evaluate all C-preprocessor directives found in the sources and include -# files. - -ENABLE_PREPROCESSING = YES - -# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro -# names in the source code. If set to NO (the default) only conditional -# compilation will be performed. Macro expansion can be done in a controlled -# way by setting EXPAND_ONLY_PREDEF to YES. - -MACRO_EXPANSION = YES - -# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES -# then the macro expansion is limited to the macros specified with the -# PREDEFINED and EXPAND_AS_DEFINED tags. - -EXPAND_ONLY_PREDEF = YES - -# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files -# pointed to by INCLUDE_PATH will be searched when a #include is found. - -SEARCH_INCLUDES = YES - -# The INCLUDE_PATH tag can be used to specify one or more directories that -# contain include files that are not input files but should be processed by -# the preprocessor. - -INCLUDE_PATH = - -# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard -# patterns (like *.h and *.hpp) to filter out the header-files in the -# directories. If left blank, the patterns specified with FILE_PATTERNS will -# be used. - -INCLUDE_FILE_PATTERNS = - -# The PREDEFINED tag can be used to specify one or more macro names that -# are defined before the preprocessor is started (similar to the -D option of -# gcc). The argument of the tag is a list of macros of the form: name -# or name=definition (no spaces). If the definition and the = are -# omitted =1 is assumed. To prevent a macro definition from being -# undefined via #undef or recursively expanded use the := operator -# instead of the = operator. - -PREDEFINED = OMP_30_ENABLED=1, OMP_40_ENABLED=1, KMP_STATS_ENABLED=1 - -# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then -# this tag can be used to specify a list of macro names that should be expanded. -# The macro definition that is found in the sources will be used. -# Use the PREDEFINED tag if you want to use a different macro definition that -# overrules the definition found in the source code. - -EXPAND_AS_DEFINED = - -# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then -# doxygen's preprocessor will remove all references to function-like macros -# that are alone on a line, have an all uppercase name, and do not end with a -# semicolon, because these will confuse the parser if not removed. - -SKIP_FUNCTION_MACROS = YES - -#--------------------------------------------------------------------------- -# Configuration::additions related to external references -#--------------------------------------------------------------------------- - -# The TAGFILES option can be used to specify one or more tagfiles. For each -# tag file the location of the external documentation should be added. The -# format of a tag file without this location is as follows: -# -# TAGFILES = file1 file2 ... -# Adding location for the tag files is done as follows: -# -# TAGFILES = file1=loc1 "file2 = loc2" ... 
-# where "loc1" and "loc2" can be relative or absolute paths -# or URLs. Note that each tag file must have a unique name (where the name does -# NOT include the path). If a tag file is not located in the directory in which -# doxygen is run, you must also specify the path to the tagfile here. - -TAGFILES = - -# When a file name is specified after GENERATE_TAGFILE, doxygen will create -# a tag file that is based on the input files it reads. - -GENERATE_TAGFILE = - -# If the ALLEXTERNALS tag is set to YES all external classes will be listed -# in the class index. If set to NO only the inherited external classes -# will be listed. - -ALLEXTERNALS = NO - -# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will -# be listed. - -EXTERNAL_GROUPS = YES - -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = - -#--------------------------------------------------------------------------- -# Configuration options related to the dot tool -#--------------------------------------------------------------------------- - -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option also works with HAVE_DOT disabled, but it is recommended to -# install and use dot, since it yields more powerful graphs. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see -# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - -# If set to YES, the inheritance and collaboration graphs will hide -# inheritance and usage relations if the target is undocumented -# or is not a class. - -HIDE_UNDOC_RELATIONS = YES - -# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is -# available from the path. This tool is part of Graphviz, a graph visualization -# toolkit from AT&T and Lucent Bell Labs. The other options in this section -# have no effect if this option is set to NO (the default) - -HAVE_DOT = NO - -# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is -# allowed to run in parallel. When set to 0 (the default) doxygen will -# base this on the number of processors available in the system. You can set it -# explicitly to a value larger than 0 to get control over the balance -# between CPU load and processing speed. - -DOT_NUM_THREADS = 0 - -# By default doxygen will use the Helvetica font for all dot files that -# doxygen generates. When you want a differently looking font you can specify -# the font name using DOT_FONTNAME. You need to make sure dot is able to find -# the font, which can be done by putting it in a standard location or by setting -# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the -# directory containing the font. - -DOT_FONTNAME = Helvetica - -# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. -# The default size is 10pt. - -DOT_FONTSIZE = 10 - -# By default doxygen will tell dot to use the Helvetica font. 
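Pulling together two mechanisms from this section: a PREDEFINED entry of the form name=1 defines a macro that can still be #undef'ed, while name:=1 protects it from #undef and recursive expansion; TAGFILES pairs each tag file with the location of its external documentation. All names below are hypothetical, not values from this file:

    PREDEFINED = FEATURE_ENABLED=1 PROTECTED_FLAG:=1
    TAGFILES   = external.tag=http://example.org/docs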
-# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to -# set the path where dot can find it. - -DOT_FONTPATH = - -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# CLASS_DIAGRAMS tag to NO. - -CLASS_GRAPH = YES - -# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect implementation dependencies (inheritance, containment, and -# class references variables) of the class with other documented classes. - -COLLABORATION_GRAPH = NO - -# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for groups, showing the direct groups dependencies - -GROUP_GRAPHS = YES - -# If the UML_LOOK tag is set to YES doxygen will generate inheritance and -# collaboration diagrams in a style similar to the OMG's Unified Modeling -# Language. - -UML_LOOK = NO - -# If the UML_LOOK tag is enabled, the fields and methods are shown inside -# the class node. If there are many fields or methods and many nodes the -# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS -# threshold limits the number of items for each type to make the size more -# manageable. Set this to 0 for no limit. Note that the threshold may be -# exceeded by 50% before the limit is enforced. - -UML_LIMIT_NUM_FIELDS = 10 - -# If set to YES, the inheritance and collaboration graphs will show the -# relations between templates and their instances. - -TEMPLATE_RELATIONS = YES - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT -# tags are set to YES then doxygen will generate a graph for each documented -# file showing the direct and indirect include dependencies of the file with -# other documented files. - -INCLUDE_GRAPH = NO - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and -# HAVE_DOT tags are set to YES then doxygen will generate a graph for each -# documented header file showing the documented files that directly or -# indirectly include this file. - -INCLUDED_BY_GRAPH = NO - -# If the CALL_GRAPH and HAVE_DOT options are set to YES then -# doxygen will generate a call dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable call graphs -# for selected functions only using the \callgraph command. - -CALL_GRAPH = NO - -# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then -# doxygen will generate a caller dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable caller -# graphs for selected functions only using the \callergraph command. - -CALLER_GRAPH = NO - -# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen -# will generate a graphical hierarchy of all classes instead of a textual one. - -GRAPHICAL_HIERARCHY = YES - -# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES -# then doxygen will show the dependencies a directory has on other directories -# in a graphical way. The dependency relations are determined by the #include -# relations between the files in the directories. 
- -DIRECTORY_GRAPH = YES - -# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images -# generated by dot. Possible values are svg, png, jpg, or gif. -# If left blank png will be used. If you choose svg you need to set -# HTML_FILE_EXTENSION to xhtml in order to make the SVG files -# visible in IE 9+ (other browsers do not have this requirement). - -DOT_IMAGE_FORMAT = png - -# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to -# enable generation of interactive SVG images that allow zooming and panning. -# Note that this requires a modern browser other than Internet Explorer. -# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you -# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files -# visible. Older versions of IE do not have SVG support. - -INTERACTIVE_SVG = NO - -# The tag DOT_PATH can be used to specify the path where the dot tool can be -# found. If left blank, it is assumed the dot tool can be found in the path. - -DOT_PATH = - -# The DOTFILE_DIRS tag can be used to specify one or more directories that -# contain dot files that are included in the documentation (see the -# \dotfile command). - -DOTFILE_DIRS = - -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the -# \mscfile command). - -MSCFILE_DIRS = - -# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of -# nodes that will be shown in the graph. If the number of nodes in a graph -# becomes larger than this value, doxygen will truncate the graph, which is -# visualized by representing a node as a red box. Note that doxygen if the -# number of direct children of the root node in a graph is already larger than -# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note -# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. - -DOT_GRAPH_MAX_NODES = 50 - -# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the -# graphs generated by dot. A depth value of 3 means that only nodes reachable -# from the root by following a path via at most 3 edges will be shown. Nodes -# that lay further from the root node will be omitted. Note that setting this -# option to 1 or 2 may greatly reduce the computation time needed for large -# code bases. Also note that the size of a graph can be further restricted by -# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. - -MAX_DOT_GRAPH_DEPTH = 0 - -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not -# seem to support this out of the box. Warning: Depending on the platform used, -# enabling this option may lead to badly anti-aliased labels on the edges of -# a graph (i.e. they become hard to read). - -DOT_TRANSPARENT = NO - -# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output -# files in one run (i.e. multiple -o and -T options on the command line). This -# makes dot run faster, but since only newer versions of dot (>1.8.10) -# support this, this feature is disabled by default. - -DOT_MULTI_TARGETS = NO - -# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will -# generate a legend page explaining the meaning of the various boxes and -# arrows in the dot generated graphs. 
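Following the DOT_IMAGE_FORMAT and INTERACTIVE_SVG notes above, an interactive SVG graph setup needs several tags to change together; the .xhtml extension is what makes the SVG files visible in IE 9 and later:

    HAVE_DOT            = YES
    DOT_IMAGE_FORMAT    = svg
    INTERACTIVE_SVG     = YES
    HTML_FILE_EXTENSION = .xhtml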
-
-GENERATE_LEGEND = YES
-
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
-# remove the intermediate dot files that are used to generate
-# the various graphs.
-
-DOT_CLEANUP = YES
+# Doxyfile 1.8.2
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME = "LLVM OpenMP* Runtime Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = doc/doxygen/generated
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = NO + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. Note that you specify absolute paths here, but also +# relative paths, which will be relative from the directory where doxygen is +# started. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. 
+ +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 8 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = "other=*" + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding +# "class=itcl::class" will allow you to use the command class in the +# itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. 
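To make the ALIASES mechanism above concrete, the sideeffect alias quoted in its comment would be declared as below (this file itself only defines the other alias):

    ALIASES += "sideeffect=\par Side Effects:\n"

A documentation comment can then write @sideeffect (or \sideeffect) to open a user-defined paragraph headed "Side Effects:".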
+# Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension,
+# and language is one of the parsers supported by doxygen: IDL, Java,
+# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, and VHDL.
+# For instance to make doxygen treat .inc files as Fortran files (default
+# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note
+# that for custom extensions you also need to set FILE_PATTERNS otherwise the
+# files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
+# comments according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you
+# can mix doxygen, HTML, and XML commands with Markdown formatting.
+# Disable only in case of backward compatibility issues.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented classes,
+# or namespaces to their corresponding documentation. Such a link can be
+# prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string) vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to
+# indicate getter and setter methods for a property. Setting this
+# option to YES (the default) will make doxygen replace the get and
+# set methods by a property in the documentation. This will only work
+# if the methods are indeed getting or setting a simple type. If this
+# is not the case, or you want to show the methods anyway, you should
+# set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section).
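Written out, the inc=Fortran f=C example from the EXTENSION_MAPPING comment above, together with the FILE_PATTERNS addition it says custom extensions require, reads (whether a given pattern is already in the default FILE_PATTERNS depends on the doxygen version):

    EXTENSION_MAPPING = inc=Fortran f=C
    FILE_PATTERNS    += *.inc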
+# Set it to NO to prevent subgrouping. Alternatively, this can be done per
+# class using the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory, increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
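A worked value makes the cache formula above concrete: the default of 0 gives 2^(16+0) = 65536 symbols, and each increment doubles that, so for example

    SYMBOL_CACHE_SIZE = 3    # 2^(16+3) = 2^19 = 524288 symbols
    LOOKUP_CACHE_SIZE = 0    # 2^16 = 65536 symbols, the default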
+# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal +# scope will be included in the documentation. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. 
+ +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to +# do proper type resolution of all parameters of a function it will reject a +# match between the prototype and the implementation of a member function even +# if there is only one candidate or it is obvious which candidate to choose +# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen +# will still accept a match between prototype and implementation in such cases. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. 
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using the \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+# We probably will want this, but we have no file documentation yet so it's simpler to remove
+# it for now.
+SHOW_FILES = NO
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER).
+
+WARN_FORMAT =
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = src doc/doxygen/libomp_interface.h
+# The ittnotify code also has doxygen documentation, but if we include it here
+# it takes over from us!
+# src/thirdparty/ittnotify
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS = *.c *.h *.cpp
+# We may also want to include the asm files with appropriate ifdef to ensure
+# doxygen doesn't see the content, just the documentation...
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+# Only look in the one directory.
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = src/test-touch.c
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern.
A pattern will override the setting for FILTER_PATTERN (if any) +# and it is also possible to disable source filtering for a specific pattern +# using *.ext= (so without naming a filter). This option only has effect when +# FILTER_SOURCE_FILES is enabled. + +FILTER_SOURCE_PATTERNS = + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C, C++ and Fortran comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. 
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT =
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If left blank doxygen will
+# generate a default style sheet. Note that it is recommended to use
+# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this
+# tag will in the future become obsolete.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional
+# user-defined cascading style sheet that is included after the standard
+# style sheets created by doxygen. Using this option one can overrule
+# certain style aspects. This is preferred over using HTML_STYLESHEET
+# since it does not replace the standard style sheet and is therefore more
+# robust against future updates. Doxygen will copy the style sheet file to
+# the output directory.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8. The value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of
+# entries shown in the various tree structured indices initially; the user
+# can expand and collapse entries dynamically later on. Doxygen will expand
+# the tree to such a level that at most the specified number of entries are
+# visible (unless a fully collapsed tree already exceeds this amount).
+# So setting the number of entries to 1 will produce a fully collapsed tree by
+# default. 0 is a special value representing an infinite number of entries
+# and will result in a fully expanded tree by default.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When the GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When the GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely
+# identify the documentation publisher. This should be a reverse domain-name
+# style string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the main .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when
+# generating Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+# (Qt Help Project / Custom Filters).
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#filter-attributes
+# (Qt Help Project / Filter Attributes).
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to NO if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to NO when enabling this option.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the HTML
+# output. When enabled you may also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to
+# the MathJax Content Delivery Network so you can quickly see the result without
+# installing MathJax.
+# However, it is strongly recommended to install a local
+# copy of MathJax from http://www.mathjax.org before deployment.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow; in that case enabling SERVER_BASED_SEARCH may provide a better
+# solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to set up
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate LaTeX output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT =
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER = doc/doxygen/header.tex
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT =
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields.
The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load style sheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. 
+ +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. +# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = YES + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = YES + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# pointed to by INCLUDE_PATH will be searched when a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. 
+
+PREDEFINED = OMP_30_ENABLED=1, OMP_40_ENABLED=1, KMP_STATS_ENABLED=1
+# (A brief illustrative example of the effect of these settings is given
+# after this patch.)
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. For each
+# tag file the location of the external documentation should be added. The
+# format of a tag file without this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding a location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths
+# or URLs. Note that each tag file must have a unique name (where the name does
+# NOT include the path). If a tag file is not located in the directory in which
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+ +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is +# allowed to run in parallel. When set to 0 (the default) doxygen will +# base this on the number of processors available in the system. You can set it +# explicitly to a value larger than 0 to get control over the balance +# between CPU load and processing speed. + +DOT_NUM_THREADS = 0 + +# By default doxygen will use the Helvetica font for all dot files that +# doxygen generates. When you want a differently looking font you can specify +# the font name using DOT_FONTNAME. You need to make sure dot is able to find +# the font, which can be done by putting it in a standard location or by setting +# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. + +DOT_FONTNAME = Helvetica + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the Helvetica font. +# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to +# set the path where dot can find it. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = NO + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If the UML_LOOK tag is enabled, the fields and methods are shown inside +# the class node. If there are many fields or methods and many nodes the +# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS +# threshold limits the number of items for each type to make the size more +# manageable. Set this to 0 for no limit. Note that the threshold may be +# exceeded by 50% before the limit is enforced. + +UML_LIMIT_NUM_FIELDS = 10 + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. 
+
+INCLUDE_GRAPH = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = NO
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/pstl/CREDITS.txt b/pstl/CREDITS.txt
index 174722510fde..4945fd5ad308 100644
--- a/pstl/CREDITS.txt
+++ b/pstl/CREDITS.txt
@@ -1,21 +1,21 @@
-This file is a partial list of people who have contributed to the LLVM/pstl
-(Parallel STL) project. If you have contributed a patch or made some other
-contribution to LLVM/pstl, please submit a patch to this file to add yourself,
-and it will be done!
-
-The list is sorted by surname and formatted to allow easy grepping and
-beautification by scripts. The fields are: name (N), email (E), web-address
-(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
-(S).
-
-N: Intel Corporation
-W: http://www.intel.com
-D: Created the initial implementation.
-
-N: Thomas Rodgers
-E: trodgers@redhat.com
-D: Identifier name transformation for inclusion in a Standard C++ library.
-
-N: Christopher Nelson
-E: nadiasvertex@gmail.com
-D: Add support for an OpenMP backend.
+This file is a partial list of people who have contributed to the LLVM/pstl
+(Parallel STL) project. If you have contributed a patch or made some other
+contribution to LLVM/pstl, please submit a patch to this file to add yourself,
+and it will be done!
+
+The list is sorted by surname and formatted to allow easy grepping and
+beautification by scripts. The fields are: name (N), email (E), web-address
+(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
+(S).
+
+N: Intel Corporation
+W: http://www.intel.com
+D: Created the initial implementation.
+
+N: Thomas Rodgers
+E: trodgers@redhat.com
+D: Identifier name transformation for inclusion in a Standard C++ library.
+
+N: Christopher Nelson
+E: nadiasvertex@gmail.com
+D: Add support for an OpenMP backend.
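A brief aside on the preprocessor settings in the Doxygen configuration above (this example is not part of the patch, and the declaration shown is hypothetical): with MACRO_EXPANSION and EXPAND_ONLY_PREDEF both set to YES, doxygen expands only the macros listed under PREDEFINED, so code guarded by those macros is still parsed and documented. A minimal sketch of the effect:

    /* Hypothetical header fragment, for illustration only.
     * OMP_40_ENABLED is predefined to 1 by the PREDEFINED tag in the
     * configuration above, so doxygen's preprocessor keeps this block.
     * Without that entry the guard would evaluate as undefined and the
     * declaration would be invisible to doxygen. */
    #if OMP_40_ENABLED
    /*! \brief Example query routine (hypothetical name). */
    int __kmp_example_query(int device_num);
    #endif

At the same time, EXPAND_ONLY_PREDEF keeps doxygen from expanding every other macro it encounters in the sources, which would otherwise distort the documented signatures.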
-- 
GitLab


From d5746d73cedcf7a593dc4b4f2ce2465e2d45750b Mon Sep 17 00:00:00 2001
From: Frank Schlimbach
Date: Fri, 18 Oct 2024 22:20:47 +0200
Subject: [PATCH 108/511] eliminating g++ warnings (#105520)

Eliminate g++ warnings: mostly declaring variables "[[maybe_unused]]",
adding return statements where they were missing, and fixing casts.
@rengolin

---------

Co-authored-by: Benjamin Maxwell
Co-authored-by: Renato Golin
---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |  2 +-
 mlir/CMakeLists.txt                           | 21 ++++++++++++-------
 mlir/lib/CAPI/IR/IR.cpp                       |  1 +
 .../Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp  |  3 +++
 .../Conversion/IndexToSPIRV/IndexToSPIRV.cpp  |  1 +
 .../Debug/DebuggerExecutionContextHook.cpp    |  4 ++--
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |  2 +-
 .../ArmSME/Transforms/TileAllocation.cpp      |  1 +
 .../Async/Transforms/AsyncParallelFor.cpp     |  5 -----
 mlir/lib/Dialect/Index/IR/IndexOps.cpp        |  1 +
 .../Transforms/Utils/DialectConversion.cpp    | 10 ++++-----
 mlir/unittests/Bytecode/BytecodeTest.cpp      |  2 +-
 .../Support/CyclicReplacerCacheTest.cpp       |  3 ++-
 13 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 327e7f7f8a1e..bf4c707cca06 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2849,7 +2849,7 @@ void AsmPrinter::emitJumpTableSizesSection(const MachineJumpTableInfo *MJTI,
   if (isElf) {
     MCSymbolELF *LinkedToSym = dyn_cast<MCSymbolELF>(CurrentFnSym);
-    int Flags = F.hasComdat() ? (unsigned)ELF::SHF_GROUP : 0;
+    int Flags = F.hasComdat() ? static_cast<unsigned>(ELF::SHF_GROUP) : 0;

     JumpTableSizesSection = OutContext.getELFSection(
         sectionName, ELF::SHT_LLVM_JT_SIZES, Flags, 0, GroupName, F.hasComdat(),
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index c6d44908a111..599a1cbaafd8 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -84,13 +84,20 @@ check_c_compiler_flag("-Werror=mismatched-tags" C_SUPPORTS_WERROR_MISMATCHED_TAG
 append_if(C_SUPPORTS_WERROR_MISMATCHED_TAGS "-Werror=mismatched-tags" CMAKE_C_FLAGS)
 append_if(C_SUPPORTS_WERROR_MISMATCHED_TAGS "-Werror=mismatched-tags" CMAKE_CXX_FLAGS)

-# Silence a false positive GCC -Wunused-but-set-parameter warning in constexpr
-# cases, by marking SelectedCase as used. See
-# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85827 for details. The issue is
-# fixed in GCC 10.
-if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "10.0")
-  check_cxx_compiler_flag("-Wno-unused-but-set-parameter" CXX_SUPPORTS_WNO_UNUSED_BUT_SET_PARAMETER)
-  append_if(CXX_SUPPORTS_WNO_UNUSED_BUT_SET_PARAMETER "-Wno-unused-but-set-parameter" CMAKE_CXX_FLAGS)
+if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+  # Silence a false positive GCC -Wunused-but-set-parameter warning in
+  # constexpr cases. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85827
+  # for details.
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "14.0")
+    check_cxx_compiler_flag("-Wno-unused-but-set-parameter" CXX_SUPPORTS_WNO_UNUSED_BUT_SET_PARAMETER)
+    append_if(CXX_SUPPORTS_WNO_UNUSED_BUT_SET_PARAMETER "-Wno-unused-but-set-parameter" CMAKE_CXX_FLAGS)
+  endif()
+  # Silence a false positive GCC -Wdeprecated-copy warning in cases where
+  # a copy operator is defined through "using" a base class copy operator.
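+  # (The workaround below is gated to GCC < 12, where the false positive can
+  # still fire; newer releases no longer need the flag.)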
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "12.0") + check_cxx_compiler_flag("-Wno-deprecated-copy" CXX_SUPPORTS_WNO_DEPRECTAED_COPY) + append_if(CXX_SUPPORTS_WNO_DEPRECTAED_COPY "-Wno-deprecated-copy" CMAKE_CXX_FLAGS) + endif() endif() # Installing the headers and docs needs to depend on generating any public diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index 5eb531b70aee..e7e6b11c81b9 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -736,6 +736,7 @@ static mlir::WalkResult unwrap(MlirWalkResult result) { case MlirWalkResultSkip: return mlir::WalkResult::skip(); } + llvm_unreachable("unknown result in WalkResult::unwrap"); } void mlirOperationWalk(MlirOperation op, MlirOperationWalkCallback callback, diff --git a/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp b/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp index f1fa411b8291..40a3489f7a4d 100644 --- a/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp +++ b/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp @@ -81,6 +81,7 @@ static Operation *createLoadTileSliceIntrinsic( break; } } + llvm_unreachable("unknown type in createLoadTileSliceIntrinsic"); } /// Helper to create an arm_sme.intr.st1*.(horiz|vert)' intrinsic. @@ -125,6 +126,7 @@ static Operation *createStoreTileSliceIntrinsic( loc, maskOp, ptr, tileId, tileSliceI32); } } + llvm_unreachable("unknown type in createStoreTileSliceIntrinsic"); } IntegerAttr getTileIdOrError(arm_sme::ArmSMETileOpInterface op) { @@ -850,6 +852,7 @@ struct StreamingVLOpConversion case arm_sme::TypeSize::Double: return rewriter.create(loc, i64Type); } + llvm_unreachable("unknown type size in StreamingVLOpConversion"); }(); rewriter.replaceOpWithNewOp( streamingVlOp, rewriter.getIndexType(), intrOp->getResult(0)); diff --git a/mlir/lib/Conversion/IndexToSPIRV/IndexToSPIRV.cpp b/mlir/lib/Conversion/IndexToSPIRV/IndexToSPIRV.cpp index b4cc8324883e..7c441830e1e3 100644 --- a/mlir/lib/Conversion/IndexToSPIRV/IndexToSPIRV.cpp +++ b/mlir/lib/Conversion/IndexToSPIRV/IndexToSPIRV.cpp @@ -310,6 +310,7 @@ struct ConvertIndexCmpPattern final : OpConversionPattern { case IndexCmpPredicate::ULT: return rewriteCmpOp(op, adaptor, rewriter); } + llvm_unreachable("Unknown predicate in ConvertIndexCmpPattern"); } }; diff --git a/mlir/lib/Debug/DebuggerExecutionContextHook.cpp b/mlir/lib/Debug/DebuggerExecutionContextHook.cpp index 744a0380ec71..863113928d5b 100644 --- a/mlir/lib/Debug/DebuggerExecutionContextHook.cpp +++ b/mlir/lib/Debug/DebuggerExecutionContextHook.cpp @@ -301,7 +301,7 @@ void mlirDebuggerAddFileLineColLocBreakpoint(const char *file, int line, LLVM_ATTRIBUTE_NOINLINE void mlirDebuggerBreakpointHook() { static LLVM_THREAD_LOCAL void *volatile sink; - sink = (void *)&sink; + sink = static_cast(const_cast(&sink)); } static void preventLinkerDeadCodeElim() { @@ -321,7 +321,7 @@ static void preventLinkerDeadCodeElim() { sink = (void *)mlirDebuggerAddTagBreakpoint; sink = (void *)mlirDebuggerAddRewritePatternBreakpoint; sink = (void *)mlirDebuggerAddFileLineColLocBreakpoint; - sink = (void *)&sink; + sink = static_cast(const_cast(&sink)); return true; }(); (void)initialized; diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 63447baa31eb..492e4781f578 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -349,7 +349,7 @@ LogicalResult DPPOp::verify() { return emitOpError("quad_perm attribute must have exactly 4 elements"); } for (auto elem : 
quadPermAttr.getAsRange()) { - uint32_t num = elem.getInt(); + int32_t num = elem.getInt(); if (num < 0 || num > 3) { return emitOpError( "Each element of quad_perm must be in the range [0, 3]"); diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp index 3a2042d23e53..84556fbefbc9 100644 --- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp +++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp @@ -137,6 +137,7 @@ static ArrayRef getMasks(ArmSMETileType type) { case ArmSMETileType::ZAQ: return ZA_Q_MASKS; } + llvm_unreachable("unknown type in getMasks"); } class TileAllocator { diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp index 8c3e25355f60..273101ce5f3e 100644 --- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp +++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp @@ -141,7 +141,6 @@ struct ParallelComputeFunctionArgs { BlockArgument blockSize(); ArrayRef tripCounts(); ArrayRef lowerBounds(); - ArrayRef upperBounds(); ArrayRef steps(); ArrayRef captures(); @@ -175,10 +174,6 @@ ArrayRef ParallelComputeFunctionArgs::lowerBounds() { return args.drop_front(2 + 1 * numLoops).take_front(numLoops); } -ArrayRef ParallelComputeFunctionArgs::upperBounds() { - return args.drop_front(2 + 2 * numLoops).take_front(numLoops); -} - ArrayRef ParallelComputeFunctionArgs::steps() { return args.drop_front(2 + 3 * numLoops).take_front(numLoops); } diff --git a/mlir/lib/Dialect/Index/IR/IndexOps.cpp b/mlir/lib/Dialect/Index/IR/IndexOps.cpp index 42401dae217c..5ad989b7da12 100644 --- a/mlir/lib/Dialect/Index/IR/IndexOps.cpp +++ b/mlir/lib/Dialect/Index/IR/IndexOps.cpp @@ -594,6 +594,7 @@ static bool compareSameArgs(IndexCmpPredicate pred) { case IndexCmpPredicate::ULT: return false; } + llvm_unreachable("unknown predicate in compareSameArgs"); } OpFoldResult CmpOp::fold(FoldAdaptor adaptor) { diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 1baddd881f6a..b8d0329906a8 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1045,9 +1045,8 @@ UnresolvedMaterializationRewrite::UnresolvedMaterializationRewrite( const TypeConverter *converter, MaterializationKind kind, Type originalType) : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op), converterAndKind(converter, kind), originalType(originalType) { - assert(!originalType || - kind == MaterializationKind::Target && - "original type is valid only for target materializations"); + assert((!originalType || kind == MaterializationKind::Target) && + "original type is valid only for target materializations"); rewriterImpl.unresolvedMaterializations[op] = this; } @@ -1337,9 +1336,8 @@ Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization( MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc, ValueRange inputs, Type outputType, Type originalType, const TypeConverter *converter) { - assert(!originalType || - kind == MaterializationKind::Target && - "original type is valid only for target materializations"); + assert((!originalType || kind == MaterializationKind::Target) && + "original type is valid only for target materializations"); // Avoid materializing an unnecessary cast. 
   if (inputs.size() == 1 && inputs.front().getType() == outputType)
diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp
index 0342f294f38d..baf5d5c650e3 100644
--- a/mlir/unittests/Bytecode/BytecodeTest.cpp
+++ b/mlir/unittests/Bytecode/BytecodeTest.cpp
@@ -54,7 +54,7 @@ TEST(Bytecode, MultiModuleWithResource) {
   constexpr size_t kAlignment = 0x20;
   size_t bufferSize = buffer.size();
   buffer.reserve(bufferSize + kAlignment - 1);
-  size_t pad = ~(uintptr_t)buffer.data() + 1 & kAlignment - 1;
+  size_t pad = (~(uintptr_t)buffer.data() + 1) & (kAlignment - 1);
   buffer.insert(0, pad, ' ');
   StringRef alignedBuffer(buffer.data() + pad, bufferSize);
diff --git a/mlir/unittests/Support/CyclicReplacerCacheTest.cpp b/mlir/unittests/Support/CyclicReplacerCacheTest.cpp
index 64a8ab72b69b..26f0709f7d83 100644
--- a/mlir/unittests/Support/CyclicReplacerCacheTest.cpp
+++ b/mlir/unittests/Support/CyclicReplacerCacheTest.cpp
@@ -225,7 +225,8 @@ public:
   /// Add a recursive-self-node, i.e. a duplicate of the original node that is
   /// meant to represent an indirection to it.
   std::pair addRecursiveSelfNode(Graph::Node originalId) {
-    return {addNode(originalId, nextRecursionId), nextRecursionId++};
+    auto node = addNode(originalId, nextRecursionId);
+    return {node, nextRecursionId++};
   }

   void addEdge(Node src, Node sink) { connections.addEdge(src, sink); }
-- 
GitLab


From 7437f3ef7e2c56f0f54154bba1260150bbf7a59e Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Fri, 18 Oct 2024 13:24:15 -0700
Subject: [PATCH 109/511] [lldb] Document SymbolFileJSON (#112938)

I've had multiple requests for documentation about the JSON symbol file
format that LLDB supports. This patch documents the structure and
fields, shows a handful of examples, and explains how to use it in LLDB.
---
 lldb/docs/resources/symbolfilejson.rst | 204 +++++++++++++++++++++++++
 1 file changed, 204 insertions(+)
 create mode 100644 lldb/docs/resources/symbolfilejson.rst

diff --git a/lldb/docs/resources/symbolfilejson.rst b/lldb/docs/resources/symbolfilejson.rst
new file mode 100644
index 000000000000..9d15d704f357
--- /dev/null
+++ b/lldb/docs/resources/symbolfilejson.rst
@@ -0,0 +1,204 @@
+JSON Symbol File Format
+=======================
+
+The JSON symbol file format encodes symbols in a text-based, human-readable
+format. JSON symbol files can be used to symbolicate programs that lack symbol
+information, for example because they have been stripped.
+
+Under the hood, the JSON symbol file format is also used by the crashlog
+script, specifically to provide symbol information for interactive crashlogs.
+
+Format
+------
+
+The symbol file consists of a single JSON object with the following top level
+keys:
+
+* ``triple`` (string)
+* ``uuid`` (string)
+* ``type`` (string, optional)
+* ``sections`` (array, optional)
+* ``symbols`` (array, optional)
+
+The ``triple``, ``uuid`` and ``type`` form the header and should therefore come
+first. The ``type`` field is optional. The body consists of ``sections`` and
+``symbols``. Both arrays are optional; they may be omitted or left empty.
+
+triple
+``````
+
+The triple is a string with the triple of the object file it corresponds to.
+The triple follows the same format as used by LLVM:
+``<arch>-<vendor>-<os>-<environment>``.
+
+.. code-block:: JSON
+
+  { "triple": "arm64-apple-darwin22.0.0" }
+
+uuid
+````
+
+The UUID is a string with the textual representation of the UUID of the object
+file it corresponds to.
+The UUID is represented as outlined in RFC 4122: 32 hexadecimal digits,
+displayed in five groups separated by hyphens, in the form 8-4-4-4-12 for a
+total of 36 characters (32 alphanumeric characters and four hyphens).
+
+.. code-block:: JSON
+
+  { "uuid": "2107157B-6D7E-39F6-806D-AECDC15FC533" }
+
+type
+````
+
+The optional ``type`` field allows you to specify the type of object file the
+JSON file represents. This is often unnecessary and can be omitted, in which
+case the file is considered to be of type ``DebugInfo``.
+
+Valid values for the ``type`` field are:
+
+* ``corefile``: A core file that has a checkpoint of a program's execution state.
+* ``executable``: A normal executable.
+* ``debuginfo``: An object file that contains only debug information.
+* ``dynamiclinker``: The platform's dynamic linker executable.
+* ``objectfile``: An intermediate object file.
+* ``sharedlibrary``: A shared library that can be used during execution.
+* ``stublibrary``: A library that can be linked against but not used for execution.
+* ``jit``: JIT code that has symbols, sections and possibly debug info.
+
+sections
+````````
+
+Sections are JSON objects with the following keys:
+
+* ``name``: a string representing the section name.
+* ``type``: a string representing the section type (see below).
+* ``address``: a number representing the section file address.
+* ``size``: a number representing the section size in bytes.
+
+.. code-block:: JSON
+
+  {
+    "name": "__TEXT",
+    "type": "code",
+    "address": 0,
+    "size": 546
+  }
+
+The ``type`` field accepts the following values: ``code``, ``container``,
+``data``, ``debug``.
+
+symbols
+```````
+
+Symbols are JSON objects with the following keys:
+
+* ``name``: a string representing the symbol name.
+* ``value``: a number representing the symbol value.
+* ``address``: a number representing the symbol address in a section.
+* ``size``: a number representing the symbol size.
+* ``type``: an optional string representing the symbol type (see below).
+
+A symbol must contain either a ``value`` or an ``address``. The ``type`` is
+optional.
+
+.. code-block:: JSON
+
+  {
+    "name": "foo",
+    "type": "code",
+    "size": 10,
+    "address": 4294983544
+  }
+
+The ``type`` field accepts any type in the ``lldb::SymbolType`` enum in
+`lldb-enumerations.h `_, without the ``eSymbolType`` prefix. For example
+``code`` maps to ``eSymbolTypeCode`` and ``variableType`` to
+``eSymbolTypeVariableType``.
+
+Usage
+-----
+
+Symbol files can be added with the ``target symbol add`` command. The triple
+and UUID will be used to match it to the correct module.
+
+.. code-block:: shell
+
+  (lldb) target symbol add /path/to/symbol.json
+  symbol file '/path/to/symbol.json' has been added to '/path/to/executable'
+
+You can use ``image list`` to confirm that the symbol file has been associated
+with the module.
+
+.. code-block:: shell
+
+  (lldb) image list
+  [  0] A711AB38-1FB1-38B1-B38B-859352ED2A20 0x0000000100000000 /path/to/executable
+        /path/to/symbol.json
+  [  1] 4BF76A72-53CC-3E42-8945-4E314C101535 0x00000001800c6000 /usr/lib/dyld
+
+Example
+-------
+
+The simplest valid JSON symbol file consists of just a triple and UUID:
+
+.. code-block:: JSON
+
+  {
+    "triple": "arm64-apple-macosx15.0.0",
+    "uuid": "A711AB38-1FB1-38B1-B38B-859352ED2A20"
+  }
+
+A JSON symbol file with symbols for ``main``, ``foo``, and ``bar``:
+
+..
code-block:: JSON + + { + "triple": "arm64-apple-macosx15.0.0", + "uuid": "321C6225-2378-3E6D-B6C1-6374DEC6D81A", + "symbols": [ + { + "name": "main", + "type": "code", + "size": 32, + "address": 4294983552 + }, + { + "name": "foo", + "type": "code", + "size": 8, + "address": 4294983544 + }, + { + "name": "bar", + "type": "code", + "size": 0, + "value": 255 + } + ] + } + +A symbol file with a symbol ``foo`` belonging to the ``__TEXT`` section. + +.. code-block:: JSON + + { + "triple": "arm64-apple-macosx15.0.0", + "uuid": "58489DB0-F9FF-4E62-ABD1-A7CCE5DFB879", + "type": "sharedlibrary", + "sections": [ + { + "name": "__TEXT", + "type": "code", + "address": 0, + "size": 546 + } + ], + "symbols": [ + { + "name": "foo", + "address": 256, + "size": 17 + } + ] + } -- GitLab From 4c4b93dcb9d8f2400891ffbe79ff55dc9e70b71b Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Oct 2024 13:28:24 -0700 Subject: [PATCH 110/511] [SLP][NFC]Add a test with the incorrect casting of freeze instruction operands, NFC --- .../SLPVectorizer/freeze-signedness-missed.ll | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/freeze-signedness-missed.ll diff --git a/llvm/test/Transforms/SLPVectorizer/freeze-signedness-missed.ll b/llvm/test/Transforms/SLPVectorizer/freeze-signedness-missed.ll new file mode 100644 index 000000000000..3e593af86896 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/freeze-signedness-missed.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define i32 @test(i1 %.b, i8 %conv18, i32 %k.promoted61) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i1 [[DOTB:%.*]], i8 [[CONV18:%.*]], i32 [[K_PROMOTED61:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[DOTB]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i8> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> poison, i8 [[CONV18]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <4 x i8> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i8> zeroinitializer, <4 x i8> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = sub nuw <4 x i8> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP5]], <4 x i8> [[TMP9]], <4 x i8> [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP15]], [[K_PROMOTED61]] +; CHECK-NEXT: ret i32 [[OP_RDX]] +; + %not..b79 = xor i1 %.b, true + %3 = zext i1 %not..b79 to i8 + %cmp.i51 = icmp eq i8 %3, 0 + %cond.i55 = freeze i8 %3 + %.cmp = icmp ugt i8 %cond.i55, %conv18 + %.urem = select i1 %.cmp, i8 0, i8 %cond.i55 + %4 = sub nuw i8 %conv18, %.urem + %cond.in.i = select i1 %cmp.i51, i8 %conv18, i8 %4 + %not..b80 = xor i1 %.b, true + %5 = zext i1 %not..b80 to i8 + %cmp.i51.1 = icmp eq i8 %5, 0 + %cond.i55.1 = freeze i8 %5 + %.cmp.1 = icmp ugt i8 
%cond.i55.1, %conv18 + %.urem.1 = select i1 %.cmp.1, i8 0, i8 %cond.i55.1 + %6 = sub nuw i8 %conv18, %.urem.1 + %cond.in.i.1 = select i1 %cmp.i51.1, i8 %conv18, i8 %6 + %not..b81 = xor i1 %.b, true + %7 = zext i1 %not..b81 to i8 + %cmp.i51.2 = icmp eq i8 %7, 0 + %cond.i55.2 = freeze i8 %7 + %.cmp.2 = icmp ugt i8 %cond.i55.2, %conv18 + %.urem.2 = select i1 %.cmp.2, i8 0, i8 %cond.i55.2 + %8 = sub nuw i8 %conv18, %.urem.2 + %cond.in.i.2 = select i1 %cmp.i51.2, i8 %conv18, i8 %8 + %not..b = xor i1 %.b, true + %9 = zext i1 %not..b to i8 + %cmp.i51.3 = icmp eq i8 %9, 0 + %cond.i55.3 = freeze i8 %9 + %.cmp.3 = icmp ugt i8 %cond.i55.3, %conv18 + %.urem.3 = select i1 %.cmp.3, i8 0, i8 %cond.i55.3 + %10 = sub nuw i8 %conv18, %.urem.3 + %cond.in.i.3 = select i1 %cmp.i51.3, i8 %conv18, i8 %10 + %conv26 = zext nneg i8 %cond.in.i to i32 + %or = or i32 %k.promoted61, %conv26 + %conv26.1 = zext nneg i8 %cond.in.i.1 to i32 + %or.1 = or i32 %or, %conv26.1 + %conv26.2 = zext nneg i8 %cond.in.i.2 to i32 + %or.2 = or i32 %or.1, %conv26.2 + %conv26.3 = zext nneg i8 %cond.in.i.3 to i32 + %or.3 = or i32 %or.2, %conv26.3 + ret i32 %or.3 +} -- GitLab From 65cf7afb6d9d8c6137b90d909ee4fcf251439f48 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Fri, 18 Oct 2024 22:35:12 +0200 Subject: [PATCH 111/511] [libc][math][c23] Add logf16 C23 math function (#106072) Part of #95250. --- libc/config/gpu/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 22 ++++ libc/src/math/generic/expxf16.h | 28 ++++ libc/src/math/generic/logf16.cpp | 157 +++++++++++++++++++++++ libc/src/math/logf16.h | 21 +++ libc/test/UnitTest/FPMatcher.h | 13 ++ libc/test/src/math/CMakeLists.txt | 11 ++ libc/test/src/math/logf16_test.cpp | 40 ++++++ libc/test/src/math/smoke/CMakeLists.txt | 13 ++ libc/test/src/math/smoke/logf16_test.cpp | 49 +++++++ 14 files changed, 359 insertions(+), 1 deletion(-) create mode 100644 libc/src/math/generic/logf16.cpp create mode 100644 libc/src/math/logf16.h create mode 100644 libc/test/src/math/logf16_test.cpp create mode 100644 libc/test/src/math/smoke/logf16_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index d89093b2117c..d9df737efea3 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -568,6 +568,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.llrintf16 libc.src.math.llroundf16 libc.src.math.logbf16 + libc.src.math.logf16 libc.src.math.lrintf16 libc.src.math.lroundf16 libc.src.math.modff16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 7314dbc660f3..3d9d5a9e984c 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -661,6 +661,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.llrintf16 libc.src.math.llroundf16 libc.src.math.logbf16 + libc.src.math.logf16 libc.src.math.lrintf16 libc.src.math.lroundf16 libc.src.math.modff16 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 010377a90f6e..a4c59190a01b 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -310,7 +310,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | lgamma | | | | | | 7.12.8.3 | F.10.5.3 | 
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| log | |check| | |check| | | | | 7.12.6.11 | F.10.3.11 | +| log | |check| | |check| | | |check| | | 7.12.6.11 | F.10.3.11 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | log10 | |check| | |check| | | | | 7.12.6.12 | F.10.3.12 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 196dab9f81b3..cc49835ac7e1 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -651,6 +651,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"log", RetValSpec, [ArgSpec]>, FunctionSpec<"logf", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"logf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, FunctionSpec<"logb", RetValSpec, [ArgSpec]>, FunctionSpec<"logbf", RetValSpec, [ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 8427b550ab4c..9c6646cd658e 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -343,6 +343,7 @@ add_math_entrypoint_object(log2f) add_math_entrypoint_object(log) add_math_entrypoint_object(logf) +add_math_entrypoint_object(logf16) add_math_entrypoint_object(logb) add_math_entrypoint_object(logbf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 81b3e44db792..b95672bc3968 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -2289,6 +2289,28 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + logf16 + SRCS + logf16.cpp + HDRS + ../logf16.h + DEPENDS + .expxf16 + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( logb SRCS diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h index 7202b1b11319..357421958b4d 100644 --- a/libc/src/math/generic/expxf16.h +++ b/libc/src/math/generic/expxf16.h @@ -288,6 +288,34 @@ template LIBC_INLINE float16 eval_sinh_or_cosh(float16 x) { lo, half_p_odd * exp2_hi_mid_diff, half_p_even * exp2_hi_mid_sum)); } +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > for i from 0 to 31 do print(round(log(1 + i * 2^-5), SG, RN)); +constexpr cpp::array LOGF_F = { + 0x0p+0f, 0x1.f829bp-6f, 0x1.f0a30cp-5f, 0x1.6f0d28p-4f, + 0x1.e27076p-4f, 0x1.29553p-3f, 0x1.5ff308p-3f, 0x1.9525aap-3f, + 0x1.c8ff7cp-3f, 0x1.fb9186p-3f, 0x1.1675cap-2f, 0x1.2e8e2cp-2f, + 0x1.4618bcp-2f, 0x1.5d1bdcp-2f, 0x1.739d8p-2f, 0x1.89a338p-2f, + 0x1.9f323ep-2f, 0x1.b44f78p-2f, 0x1.c8ff7cp-2f, 0x1.dd46ap-2f, + 0x1.f128f6p-2f, 0x1.02552ap-1f, 0x1.0be72ep-1f, 0x1.154c3ep-1f, + 0x1.1e85f6p-1f, 0x1.2795e2p-1f, 0x1.307d74p-1f, 0x1.393e0ep-1f, + 0x1.41d8fep-1f, 0x1.4a4f86p-1f, 0x1.52a2d2p-1f, 0x1.5ad404p-1f, +}; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > for i from 0 to 31 do print(round(1 / 
(1 + i * 2^-5), SG, RN)); +constexpr cpp::array ONE_OVER_F_F = { + 0x1p+0f, 0x1.f07c2p-1f, 0x1.e1e1e2p-1f, 0x1.d41d42p-1f, + 0x1.c71c72p-1f, 0x1.bacf92p-1f, 0x1.af286cp-1f, 0x1.a41a42p-1f, + 0x1.99999ap-1f, 0x1.8f9c18p-1f, 0x1.861862p-1f, 0x1.7d05f4p-1f, + 0x1.745d18p-1f, 0x1.6c16c2p-1f, 0x1.642c86p-1f, 0x1.5c9882p-1f, + 0x1.555556p-1f, 0x1.4e5e0ap-1f, 0x1.47ae14p-1f, 0x1.414142p-1f, + 0x1.3b13b2p-1f, 0x1.3521dp-1f, 0x1.2f684cp-1f, 0x1.29e412p-1f, + 0x1.24924ap-1f, 0x1.1f7048p-1f, 0x1.1a7b96p-1f, 0x1.15b1e6p-1f, + 0x1.111112p-1f, 0x1.0c9714p-1f, 0x1.08421p-1f, 0x1.041042p-1f, +}; + } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H diff --git a/libc/src/math/generic/logf16.cpp b/libc/src/math/generic/logf16.cpp new file mode 100644 index 000000000000..735fec9681dd --- /dev/null +++ b/libc/src/math/generic/logf16.cpp @@ -0,0 +1,157 @@ +//===-- Half-precision log(x) function ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/logf16.h" +#include "expxf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" + +namespace LIBC_NAMESPACE_DECL { + +#ifdef LIBC_TARGET_CPU_HAS_FMA +static constexpr size_t N_LOGF16_EXCEPTS = 5; +#else +static constexpr size_t N_LOGF16_EXCEPTS = 11; +#endif + +static constexpr fputil::ExceptValues + LOGF16_EXCEPTS = {{ +// (input, RZ output, RU offset, RD offset, RN offset) +#ifndef LIBC_TARGET_CPU_HAS_FMA + // x = 0x1.61cp-13, logf16(x) = -0x1.16p+3 (RZ) + {0x0987U, 0xc858U, 0U, 1U, 0U}, + // x = 0x1.f2p-12, logf16(x) = -0x1.e98p+2 (RZ) + {0x0fc8U, 0xc7a6U, 0U, 1U, 1U}, +#endif + // x = 0x1.4d4p-9, logf16(x) = -0x1.7e4p+2 (RZ) + {0x1935U, 0xc5f9U, 0U, 1U, 0U}, + // x = 0x1.5ep-8, logf16(x) = -0x1.4ecp+2 (RZ) + {0x1d78U, 0xc53bU, 0U, 1U, 0U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA + // x = 0x1.fdp-1, logf16(x) = -0x1.81p-8 (RZ) + {0x3bf4U, 0x9e04U, 0U, 1U, 1U}, + // x = 0x1.fep-1, logf16(x) = -0x1.008p-8 (RZ) + {0x3bf8U, 0x9c02U, 0U, 1U, 0U}, +#endif + // x = 0x1.ffp-1, logf16(x) = -0x1.004p-9 (RZ) + {0x3bfcU, 0x9801U, 0U, 1U, 0U}, + // x = 0x1.ff8p-1, logf16(x) = -0x1p-10 (RZ) + {0x3bfeU, 0x9400U, 0U, 1U, 1U}, +#ifdef LIBC_TARGET_CPU_HAS_FMA + // x = 0x1.4c4p+1, logf16(x) = 0x1.e84p-1 (RZ) + {0x4131U, 0x3ba1U, 1U, 0U, 1U}, +#else + // x = 0x1.75p+2, logf16(x) = 0x1.c34p+0 (RZ) + {0x45d4U, 0x3f0dU, 1U, 0U, 0U}, + // x = 0x1.75p+2, logf16(x) = 0x1.c34p+0 (RZ) + {0x45d4U, 0x3f0dU, 1U, 0U, 0U}, + // x = 0x1.d5p+9, logf16(x) = 0x1.b5cp+2 (RZ) + {0x6354U, 0x46d7U, 1U, 0U, 1U}, +#endif + }}; + +LLVM_LIBC_FUNCTION(float16, logf16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + + // If x <= 0, or x is 1, or x is +inf, or x is NaN. 
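+  // In binary16 bit patterns: 0x0000U is +0, 0x3c00U is 1.0, and everything
+  // from 0x7c00U upward covers +inf, the NaNs, and all sign-bit-set values,
+  // including -0 (0x8000U) and the negative numbers.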
+ if (LIBC_UNLIKELY(x_u == 0U || x_u == 0x3c00U || x_u >= 0x7c00U)) { + // log(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // log(+/-0) = −inf + if ((x_u & 0x7fffU) == 0U) { + fputil::raise_except_if_required(FE_DIVBYZERO); + return FPBits::inf(Sign::NEG).get_val(); + } + + if (x_u == 0x3c00U) + return FPBits::zero().get_val(); + + // When x < 0. + if (x_u > 0x8000U) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + // log(+inf) = +inf + return FPBits::inf().get_val(); + } + + if (auto r = LOGF16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + + // To compute log(x), we perform the following range reduction: + // x = 2^m * 1.mant, + // log(x) = m * log(2) + log(1.mant). + // To compute log(1.mant), let f be the highest 6 bits including the hidden + // bit, and d be the difference (1.mant - f), i.e., the remaining 5 bits of + // the mantissa, then: + // log(1.mant) = log(f) + log(1.mant / f) + // = log(f) + log(1 + d/f) + // since d/f is sufficiently small. + // We store log(f) and 1/f in the lookup tables LOGF_F and ONE_OVER_F + // respectively. + + int m = -FPBits::EXP_BIAS; + + // When x is subnormal, normalize it. + if ((x_u & FPBits::EXP_MASK) == 0U) { + // Can't pass an integer to fputil::cast directly. + constexpr float NORMALIZE_EXP = 1U << FPBits::FRACTION_LEN; + x_bits = FPBits(x_bits.get_val() * fputil::cast(NORMALIZE_EXP)); + x_u = x_bits.uintval(); + m -= FPBits::FRACTION_LEN; + } + + uint16_t mant = x_bits.get_mantissa(); + // Leading 10 - 5 = 5 bits of the mantissa. + int f = mant >> 5; + // Unbiased exponent. + m += x_u >> FPBits::FRACTION_LEN; + + // Set bits to 1.mant instead of 2^m * 1.mant. + x_bits.set_biased_exponent(FPBits::EXP_BIAS); + float mant_f = x_bits.get_val(); + // v = 1.mant * 1/f - 1 = d/f + float v = fputil::multiply_add(mant_f, ONE_OVER_F_F[f], -1.0f); + + // Degree-3 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax(log(1 + x)/x, 2, [|SG...|], [-2^-5, 2^-5]); + // > x * P; + float log1p_d_over_f = + v * fputil::polyeval(v, 0x1p+0f, -0x1.001804p-1f, 0x1.557ef6p-2f); + // log(1.mant) = log(f) + log(1 + d/f) + float log_1_mant = LOGF_F[f] + log1p_d_over_f; + return fputil::cast( + fputil::multiply_add(static_cast(m), LOGF_2, log_1_mant)); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/logf16.h b/libc/src/math/logf16.h new file mode 100644 index 000000000000..e2d296b1d908 --- /dev/null +++ b/libc/src/math/logf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for logf16 ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_LOGF16_H +#define LLVM_LIBC_SRC_MATH_LOGF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 logf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_LOGF16_H diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index e1a33ea326ec..bdcc22ef94e7 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -400,4 +400,17 @@ private: EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_MODE( \ (expected), (actual), (expected_except), RoundingMode::TowardZero) +#define EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(expected, actual, \ + expected_except) \ + do { \ + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST((expected), (actual), \ + (expected_except)); \ + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD((expected), (actual), \ + (expected_except)); \ + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD((expected), (actual), \ + (expected_except)); \ + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO((expected), (actual), \ + (expected_except)); \ + } while (0) + #endif // LLVM_LIBC_TEST_UNITTEST_FPMATCHER_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 11342e6dfa04..2d935f588488 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1772,6 +1772,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + logf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + logf16_test.cpp + DEPENDS + libc.src.math.logf16 +) + add_fp_unittest( log2_test NEED_MPFR diff --git a/libc/test/src/math/logf16_test.cpp b/libc/test/src/math/logf16_test.cpp new file mode 100644 index 000000000000..922918b092b2 --- /dev/null +++ b/libc/test/src/math/logf16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for logf16 ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/logf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcLogf16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf]; +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0]; +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcLogf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x, + LIBC_NAMESPACE::logf16(x), 0.5); + } +} + +TEST_F(LlvmLibcLogf16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x, + LIBC_NAMESPACE::logf16(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 899c9d2df453..a3cd671269ca 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3558,6 +3558,19 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + logf16_test + SUITE + libc-math-smoke-tests + SRCS + logf16_test.cpp + DEPENDS + libc.hdr.fenv_macros + libc.src.errno.errno + libc.src.math.logf16 + libc.src.__support.FPUtil.cast +) + add_fp_unittest( log2_test SUITE diff --git a/libc/test/src/math/smoke/logf16_test.cpp b/libc/test/src/math/smoke/logf16_test.cpp new file mode 100644 index 000000000000..c7232aa1c1e3 --- /dev/null +++ b/libc/test/src/math/smoke/logf16_test.cpp @@ -0,0 +1,49 @@ +//===-- Unittests for logf16 ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" +#include "src/errno/libc_errno.h" +#include "src/math/logf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcLogf16Test = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcLogf16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::logf16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::logf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::logf16(inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::logf16(neg_inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING( + neg_inf, LIBC_NAMESPACE::logf16(zero), FE_DIVBYZERO); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING( + neg_inf, LIBC_NAMESPACE::logf16(neg_zero), FE_DIVBYZERO); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING( + zero, LIBC_NAMESPACE::logf16(LIBC_NAMESPACE::fputil::cast(1.0))); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING( + aNaN, + LIBC_NAMESPACE::logf16(LIBC_NAMESPACE::fputil::cast(-1.0))); + EXPECT_MATH_ERRNO(EDOM); +} -- GitLab From e56e9dd8adca2e86f22783bf5e745ee1ba7ead5f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Oct 2024 13:32:41 -0700 Subject: [PATCH 112/511] [SLP]Fix minbitwidth emission and analysis for freeze instruction Need to add minbw emission and analysis for freeze instruction to fix incorrect signedness propagation. Fixes #112460 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 ++++++++ .../Transforms/SLPVectorizer/freeze-signedness-missed.ll | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 53632efe913e..e7b52fbdcc3b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14953,6 +14953,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { return E->VectorizedValue; } + if (Op->getType() != VecTy) { + assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() || + MinBWs.contains(getOperandEntry(E, 0))) && + "Expected item in MinBWs."); + Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0)); + } Value *V = Builder.CreateFreeze(Op); V = FinalShuffle(V, E); @@ -17095,6 +17101,8 @@ bool BoUpSLP::collectValuesToDemote( return TryProcessInstruction( BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}); } + case Instruction::Freeze: + return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0)); case Instruction::Shl: { // If we are truncating the result of this SHL, and if it's a shift of an // inrange amount, we can always perform a SHL in a smaller type. 
diff --git a/llvm/test/Transforms/SLPVectorizer/freeze-signedness-missed.ll b/llvm/test/Transforms/SLPVectorizer/freeze-signedness-missed.ll index 3e593af86896..6cd44c297882 100644 --- a/llvm/test/Transforms/SLPVectorizer/freeze-signedness-missed.ll +++ b/llvm/test/Transforms/SLPVectorizer/freeze-signedness-missed.ll @@ -9,8 +9,8 @@ define i32 @test(i1 %.b, i8 %conv18, i32 %k.promoted61) { ; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i8> +; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i8> [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> poison, i8 [[CONV18]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <4 x i8> [[TMP7]], [[TMP9]] -- GitLab From 825f9cb1b31aa91d23eba803003897490de74a20 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Oct 2024 13:44:57 -0700 Subject: [PATCH 113/511] [SLP][NFC]Add a test with the incorrect casting of the abs argument, NFC --- .../abs-overflow-incorrect-minbws.ll | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/abs-overflow-incorrect-minbws.ll diff --git a/llvm/test/Transforms/SLPVectorizer/abs-overflow-incorrect-minbws.ll b/llvm/test/Transforms/SLPVectorizer/abs-overflow-incorrect-minbws.ll new file mode 100644 index 000000000000..a936b076138d --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/abs-overflow-incorrect-minbws.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s + +define i32 @test(i32 %n) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[N]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP3]], i1 false) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; CHECK-NEXT: [[RES1:%.*]] = add i32 [[TMP5]], [[TMP6]] +; CHECK-NEXT: ret i32 [[RES1]] +; +entry: + %n1 = add i32 %n, 1 + %zn1 = zext nneg i32 %n1 to i64 + %m1 = mul nuw nsw i64 %zn1, 273837369 + %a1 = call i64 @llvm.abs.i64(i64 %m1, i1 true) + %t1 = trunc i64 %a1 to i32 + %n2 = add i32 %n, 2 + %zn2 = zext nneg i32 %n2 to i64 + %m2 = mul nuw nsw i64 %zn2, 273837369 + %a2 = call i64 @llvm.abs.i64(i64 %m2, i1 true) + %t2 = trunc i64 %a2 to i32 + %res1 = add i32 %t1, %t2 + ret i32 %res1 +} -- GitLab From 76196998e25b98d81abc437708622261810782ca Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Fri, 18 Oct 2024 21:49:23 +0100 Subject: [PATCH 114/511] [clang] Make LazyOffsetPtr more portable (#112927) LazyOffsetPtr currently relies on uint64_t being able to store a pointer and, unless sizeof(uint64_t) == sizeof(void *), little endianness, since getAddressOfPointer reinterprets the memory as a pointer. 
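To make the failure mode concrete, here is a simplified sketch of the old
scheme (hypothetical code, not the real class; the actual LazyOffsetPtr is
templated and tag-encodes offsets in the low bit):

```cpp
#include <cstdint>

// Hypothetical, simplified model of the old storage strategy.
struct OldLazyIntPtr {
  mutable uint64_t Ptr = 0; // holds either an int* or a tagged offset

  int **getAddressOfPointer() const {
    // Treats the first sizeof(int *) bytes of Ptr as the pointer. That only
    // works if the pointer's bytes start at Ptr's lowest address: true on
    // little-endian hosts, or when sizeof(int *) == sizeof(uint64_t).
    return reinterpret_cast<int **>(&Ptr);
  }
};
```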
This also doesn't properly respect the C++ object model. As removing getAddressOfPointer would have wide-reaching implications, improve the implementation to account for these problems by using placement new and a suitably sized-and-aligned buffer, "right"-aligning the objects on big-endian platforms so the LSBs are in the same place for use as the discriminator. Fixes: bc73ef0031b50f7443615fef614fb4ecaaa4bd11 Fixes: https://github.com/llvm/llvm-project/issues/111993 --- clang/include/clang/AST/ExternalASTSource.h | 48 +++++++++++++++------ 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h index 385c32edbae0..582ed7c65f58 100644 --- a/clang/include/clang/AST/ExternalASTSource.h +++ b/clang/include/clang/AST/ExternalASTSource.h @@ -25,10 +25,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/Support/PointerLikeTypeTraits.h" +#include #include #include #include #include +#include #include #include @@ -326,29 +328,49 @@ struct LazyOffsetPtr { /// /// If the low bit is clear, a pointer to the AST node. If the low /// bit is set, the upper 63 bits are the offset. - mutable uint64_t Ptr = 0; + static constexpr size_t DataSize = std::max(sizeof(uint64_t), sizeof(T *)); + alignas(uint64_t) alignas(T *) mutable unsigned char Data[DataSize] = {}; + + unsigned char GetLSB() const { + return Data[llvm::sys::IsBigEndianHost ? DataSize - 1 : 0]; + } + + template U &As(bool New) const { + unsigned char *Obj = + Data + (llvm::sys::IsBigEndianHost ? DataSize - sizeof(U) : 0); + if (New) + return *new (Obj) U; + return *std::launder(reinterpret_cast(Obj)); + } + + T *&GetPtr() const { return As(false); } + uint64_t &GetU64() const { return As(false); } + void SetPtr(T *Ptr) const { As(true) = Ptr; } + void SetU64(uint64_t U64) const { As(true) = U64; } public: LazyOffsetPtr() = default; - explicit LazyOffsetPtr(T *Ptr) : Ptr(reinterpret_cast(Ptr)) {} + explicit LazyOffsetPtr(T *Ptr) : Data() { SetPtr(Ptr); } - explicit LazyOffsetPtr(uint64_t Offset) : Ptr((Offset << 1) | 0x01) { + explicit LazyOffsetPtr(uint64_t Offset) : Data() { assert((Offset << 1 >> 1) == Offset && "Offsets must require < 63 bits"); if (Offset == 0) - Ptr = 0; + SetPtr(nullptr); + else + SetU64((Offset << 1) | 0x01); } LazyOffsetPtr &operator=(T *Ptr) { - this->Ptr = reinterpret_cast(Ptr); + SetPtr(Ptr); return *this; } LazyOffsetPtr &operator=(uint64_t Offset) { assert((Offset << 1 >> 1) == Offset && "Offsets must require < 63 bits"); if (Offset == 0) - Ptr = 0; + SetPtr(nullptr); else - Ptr = (Offset << 1) | 0x01; + SetU64((Offset << 1) | 0x01); return *this; } @@ -356,15 +378,15 @@ public: /// Whether this pointer is non-NULL. /// /// This operation does not require the AST node to be deserialized. - explicit operator bool() const { return Ptr != 0; } + explicit operator bool() const { return isOffset() || GetPtr() != nullptr; } /// Whether this pointer is non-NULL. /// /// This operation does not require the AST node to be deserialized. - bool isValid() const { return Ptr != 0; } + bool isValid() const { return isOffset() || GetPtr() != nullptr; } /// Whether this pointer is currently stored as an offset. - bool isOffset() const { return Ptr & 0x01; } + bool isOffset() const { return GetLSB() & 0x01; } /// Retrieve the pointer to the AST node that this lazy pointer points to. 
/// @@ -375,9 +397,9 @@ public: if (isOffset()) { assert(Source && "Cannot deserialize a lazy pointer without an AST source"); - Ptr = reinterpret_cast((Source->*Get)(OffsT(Ptr >> 1))); + SetPtr((Source->*Get)(OffsT(GetU64() >> 1))); } - return reinterpret_cast(Ptr); + return GetPtr(); } /// Retrieve the address of the AST node pointer. Deserializes the pointee if @@ -385,7 +407,7 @@ public: T **getAddressOfPointer(ExternalASTSource *Source) const { // Ensure the integer is in pointer form. (void)get(Source); - return reinterpret_cast(&Ptr); + return &GetPtr(); } }; -- GitLab From 709abacdc350d63c61888607edb28ce272daa0a0 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Oct 2024 13:54:30 -0700 Subject: [PATCH 115/511] [SLP]Check that operand of abs does not overflow before making it part of minbitwidth transformation Need to check that the operand of the abs intrinsic can be safely truncated before making it part of the minbitwidth transformation. Fixes #112577 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 ++++++++++++++++ .../abs-overflow-incorrect-minbws.ll | 6 ++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e7b52fbdcc3b..e1aa6127ac03 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17224,9 +17224,25 @@ bool BoUpSLP::collectValuesToDemote( MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL))); }); }; + auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { + assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); + return all_of(E.Scalars, [&](Value *V) { + auto *I = cast(V); + unsigned SignBits = OrigBitWidth - BitWidth; + APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1); + unsigned Op0SignBits = + ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT); + return SignBits <= Op0SignBits && + ((SignBits != Op0SignBits && + !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) || + MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL))); + }); + }; if (ID != Intrinsic::abs) { Operands.push_back(getOperandEntry(&E, 1)); CallChecker = CompChecker; + } else { + CallChecker = AbsChecker; } InstructionCost BestCost = std::numeric_limits::max(); diff --git a/llvm/test/Transforms/SLPVectorizer/abs-overflow-incorrect-minbws.ll b/llvm/test/Transforms/SLPVectorizer/abs-overflow-incorrect-minbws.ll index a936b076138d..51b635837d3b 100644 --- a/llvm/test/Transforms/SLPVectorizer/abs-overflow-incorrect-minbws.ll +++ b/llvm/test/Transforms/SLPVectorizer/abs-overflow-incorrect-minbws.ll @@ -8,8 +8,10 @@ define i32 @test(i32 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[N]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP3]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw nsw <2 x i64> [[TMP3]], +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP7]], i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 ; 
CHECK-NEXT: [[RES1:%.*]] = add i32 [[TMP5]], [[TMP6]] -- GitLab From 47d9ca87b0385975e8b14f5df06886ddd6057b17 Mon Sep 17 00:00:00 2001 From: Igor Kudrin Date: Sat, 19 Oct 2024 04:11:23 +0700 Subject: [PATCH 116/511] [lldb] Fix and re-enable TestUseSourceCache.py (#111237) The decorators caused the `test_set_use_source_cache_true()` test to be skipped in most scenarios. It was only run on a Windows host targeting a non-Windows remote platform. The source file is opened with the `FILE_SHARE_DELETE` sharing mode, which allows the file to be removed even though it is also memory-mapped; at least, this behavior is observed on Windows 11. The patch replaces the operation with an attempt to overwrite the file, which still fails for such files on Windows 11. --- .../use_source_cache/TestUseSourceCache.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py b/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py index 421599080a9e..8425ab09ab9d 100644 --- a/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py +++ b/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py @@ -18,9 +18,8 @@ class SettingsUseSourceCacheTestCase(TestBase): self.set_use_source_cache_and_test(False) @skipIf(hostoslist=no_match(["windows"])) - @skipIf(oslist=["windows"]) # Fails on windows 11 def test_set_use_source_cache_true(self): - """Test that after 'set use-source-cache false', files are locked.""" + """Test that after 'set use-source-cache true', files are locked.""" self.set_use_source_cache_and_test(True) def set_use_source_cache_and_test(self, is_cache_enabled): @@ -46,23 +45,27 @@ class SettingsUseSourceCacheTestCase(TestBase): # Show the source file contents to make sure LLDB loads src file. self.runCmd("source list") - # Try deleting the source file. - is_file_removed = self.removeFile(src) + # Try overwriting the source file. + is_file_overwritten = self.overwriteFile(src) if is_cache_enabled: self.assertFalse( - is_file_removed, "Source cache is enabled, but delete file succeeded" + is_file_overwritten, + "Source cache is enabled, but writing to file succeeded", ) if not is_cache_enabled: self.assertTrue( - is_file_removed, "Source cache is disabled, but delete file failed" + is_file_overwritten, + "Source cache is disabled, but writing to file failed", ) - def removeFile(self, src): - """Remove file and return true iff file was successfully removed.""" + def overwriteFile(self, src): + """Write to file and return true iff file was successfully written.""" try: - os.remove(src) + f = open(src, "w") + f.writelines(["// hello world\n"]) + f.close() return True except Exception: return False -- GitLab From b88d94caba518bc63c25fe476c4de3d9b0bbd2c0 Mon Sep 17 00:00:00 2001 From: Igor Kudrin Date: Sat, 19 Oct 2024 04:13:27 +0700 Subject: [PATCH 117/511] [lldb] Speed up FindInMemory tests (#111951) A memory region can be relatively large. Searching for a value in the entire region is time-consuming, especially when running tests against a remote target, because the memory data is transferred in small chunks over a relatively slow GDB Remote Protocol. The patch limits the address range to be searched to 2K, which seems sufficient for these tests. In my setup, for local runs, these tests now take half the time they did before the patch. For a remote target, the improvement is even more significant. 
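The cleanup follows this general pattern (a hypothetical before/after sketch,
not lines taken from the patch itself):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"

// Pull the commonly used LLVM types into scope once per backend file...
using llvm::raw_ostream;
using llvm::Record;
using llvm::StringRef;

// ...so individual uses can drop the repeated llvm:: qualifier.
// Before: static void emitDef(const llvm::Record &def, llvm::raw_ostream &os)
static void emitDef(const Record &def, raw_ostream &os) {
  StringRef name = def.getName(); // was: llvm::StringRef name = ...
  os << name << "\n";
}
```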
--- .../find_in_memory/TestFindInMemory.py | 10 ++-- .../find_in_memory/TestFindRangesInMemory.py | 16 +++--- .../find_in_memory/address_ranges_helper.py | 50 ++++++++++++------- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/lldb/test/API/python_api/find_in_memory/TestFindInMemory.py b/lldb/test/API/python_api/find_in_memory/TestFindInMemory.py index 9ab4619b1f8f..04e807c5c620 100644 --- a/lldb/test/API/python_api/find_in_memory/TestFindInMemory.py +++ b/lldb/test/API/python_api/find_in_memory/TestFindInMemory.py @@ -55,7 +55,7 @@ class FindInMemoryTestCase(TestBase): error = lldb.SBError() addr = self.process.FindInMemory( SINGLE_INSTANCE_PATTERN_STACK, - GetStackRange(self), + GetStackRange(self, True), 1, error, ) @@ -70,7 +70,7 @@ class FindInMemoryTestCase(TestBase): error = lldb.SBError() addr = self.process.FindInMemory( DOUBLE_INSTANCE_PATTERN_HEAP, - GetHeapRanges(self)[0], + GetHeapRanges(self, True)[0], 1, error, ) @@ -86,7 +86,7 @@ class FindInMemoryTestCase(TestBase): error = lldb.SBError() addr = self.process.FindInMemory( SINGLE_INSTANCE_PATTERN_STACK, - GetStackRange(self), + GetStackRange(self, True), 0, error, ) @@ -118,7 +118,7 @@ class FindInMemoryTestCase(TestBase): error = lldb.SBError() addr = self.process.FindInMemory( "", - GetStackRange(self), + GetStackRange(self, True), 1, error, ) @@ -131,7 +131,7 @@ class FindInMemoryTestCase(TestBase): self.assertTrue(self.process, PROCESS_IS_VALID) self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) error = lldb.SBError() - range = GetAlignedRange(self) + range = GetAlignedRange(self, True) # First we make sure the pattern is found with alignment 1 addr = self.process.FindInMemory( diff --git a/lldb/test/API/python_api/find_in_memory/TestFindRangesInMemory.py b/lldb/test/API/python_api/find_in_memory/TestFindRangesInMemory.py index 31bc0e99f491..895c527430f2 100644 --- a/lldb/test/API/python_api/find_in_memory/TestFindRangesInMemory.py +++ b/lldb/test/API/python_api/find_in_memory/TestFindRangesInMemory.py @@ -30,7 +30,7 @@ class FindRangesInMemoryTestCase(TestBase): self.assertTrue(self.process, PROCESS_IS_VALID) self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) - addr_ranges = GetHeapRanges(self) + addr_ranges = GetHeapRanges(self, True) error = lldb.SBError() matches = self.process.FindRangesInMemory( DOUBLE_INSTANCE_PATTERN_HEAP, @@ -48,7 +48,7 @@ class FindRangesInMemoryTestCase(TestBase): self.assertTrue(self.process, PROCESS_IS_VALID) self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) - addr_ranges = GetStackRanges(self) + addr_ranges = GetStackRanges(self, True) error = lldb.SBError() matches = self.process.FindRangesInMemory( SINGLE_INSTANCE_PATTERN_STACK, @@ -66,7 +66,7 @@ class FindRangesInMemoryTestCase(TestBase): self.assertTrue(self.process, PROCESS_IS_VALID) self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) - addr_ranges = GetRanges(self) + addr_ranges = GetRanges(self, True) addr_ranges.Append(lldb.SBAddressRange()) self.assertGreater(addr_ranges.GetSize(), 2) error = lldb.SBError() @@ -86,7 +86,7 @@ class FindRangesInMemoryTestCase(TestBase): self.assertTrue(self.process, PROCESS_IS_VALID) self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) - addr_ranges = GetHeapRanges(self) + addr_ranges = GetHeapRanges(self, True) error = lldb.SBError() matches = self.process.FindRangesInMemory( DOUBLE_INSTANCE_PATTERN_HEAP, @@ -104,7 +104,7 @@ class 
FindRangesInMemoryTestCase(TestBase): self.assertTrue(self.process, PROCESS_IS_VALID) self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) - addr_ranges = GetHeapRanges(self) + addr_ranges = GetHeapRanges(self, True) error = lldb.SBError() matches = self.process.FindRangesInMemory( DOUBLE_INSTANCE_PATTERN_HEAP, @@ -160,7 +160,7 @@ class FindRangesInMemoryTestCase(TestBase): self.assertTrue(self.process, PROCESS_IS_VALID) self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) - addr_ranges = GetHeapRanges(self) + addr_ranges = GetHeapRanges(self, True) error = lldb.SBError() matches = self.process.FindRangesInMemory( "", @@ -178,7 +178,7 @@ class FindRangesInMemoryTestCase(TestBase): self.assertTrue(self.process, PROCESS_IS_VALID) self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) - addr_ranges = GetHeapRanges(self) + addr_ranges = GetHeapRanges(self, True) error = lldb.SBError() matches = self.process.FindRangesInMemory( DOUBLE_INSTANCE_PATTERN_HEAP, @@ -197,7 +197,7 @@ class FindRangesInMemoryTestCase(TestBase): self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) addr_ranges = lldb.SBAddressRangeList() - addr_ranges.Append(GetAlignedRange(self)) + addr_ranges.Append(GetAlignedRange(self, True)) error = lldb.SBError() matches = self.process.FindRangesInMemory( diff --git a/lldb/test/API/python_api/find_in_memory/address_ranges_helper.py b/lldb/test/API/python_api/find_in_memory/address_ranges_helper.py index 810fb9fee386..dcceca6d8a5c 100644 --- a/lldb/test/API/python_api/find_in_memory/address_ranges_helper.py +++ b/lldb/test/API/python_api/find_in_memory/address_ranges_helper.py @@ -6,27 +6,30 @@ ALIGNED_INSTANCE_PATTERN_HEAP = "i_am_unaligned_string_on_the_heap" UNALIGNED_INSTANCE_PATTERN_HEAP = ALIGNED_INSTANCE_PATTERN_HEAP[1:] -def GetAlignedRange(test_base): +def GetAlignedRange(test_base, shrink=False): frame = test_base.thread.GetSelectedFrame() ex = frame.EvaluateExpression("aligned_string_ptr") test_base.assertTrue(ex.IsValid()) - return GetRangeFromAddrValue(test_base, ex) + return GetRangeFromAddrValue(test_base, ex, shrink) -def GetStackRange(test_base): +def GetStackRange(test_base, shrink=False): frame = test_base.thread.GetSelectedFrame() ex = frame.EvaluateExpression("&stack_pointer") test_base.assertTrue(ex.IsValid()) - return GetRangeFromAddrValue(test_base, ex) + return GetRangeFromAddrValue(test_base, ex, shrink) -def GetStackRanges(test_base): +def GetStackRanges(test_base, shrink=False): addr_ranges = lldb.SBAddressRangeList() addr_ranges.Append(GetStackRange(test_base)) return addr_ranges -def GetRangeFromAddrValue(test_base, addr): +def GetRangeFromAddrValue(test_base, addr, shrink=False): + """Returns a memory region containing 'addr'. + If 'shrink' is True, the address range will be reduced to not exceed 2K. 
+ """ region = lldb.SBMemoryRegionInfo() test_base.assertTrue( test_base.process.GetMemoryRegionInfo( @@ -37,37 +40,48 @@ def GetRangeFromAddrValue(test_base, addr): test_base.assertTrue(region.IsReadable()) test_base.assertFalse(region.IsExecutable()) - address_start = lldb.SBAddress(region.GetRegionBase(), test_base.target) - stack_size = region.GetRegionEnd() - region.GetRegionBase() - return lldb.SBAddressRange(address_start, stack_size) + base = region.GetRegionBase() + end = region.GetRegionEnd() + if shrink: + addr2 = addr.GetValueAsUnsigned() + addr2 -= addr2 % 512 + base = max(base, addr2 - 1024) + end = min(end, addr2 + 1024) -def IsWithinRange(addr, range, target): + start = lldb.SBAddress(base, test_base.target) + size = end - base + + return lldb.SBAddressRange(start, size) + + +def IsWithinRange(addr, size, range, target): start_addr = range.GetBaseAddress().GetLoadAddress(target) end_addr = start_addr + range.GetByteSize() addr = addr.GetValueAsUnsigned() - return addr >= start_addr and addr < end_addr + return addr >= start_addr and addr + size <= end_addr -def GetHeapRanges(test_base): +def GetHeapRanges(test_base, shrink=False): frame = test_base.thread.GetSelectedFrame() ex = frame.EvaluateExpression("heap_pointer1") test_base.assertTrue(ex.IsValid()) - range = GetRangeFromAddrValue(test_base, ex) + range = GetRangeFromAddrValue(test_base, ex, shrink) addr_ranges = lldb.SBAddressRangeList() addr_ranges.Append(range) ex = frame.EvaluateExpression("heap_pointer2") test_base.assertTrue(ex.IsValid()) - if not IsWithinRange(ex, addr_ranges[0], test_base.target): - addr_ranges.Append(GetRangeFromAddrValue(test_base, ex)) + size = len(DOUBLE_INSTANCE_PATTERN_HEAP) + if not IsWithinRange(ex, size, addr_ranges[0], test_base.target): + addr_ranges.Append(GetRangeFromAddrValue(test_base, ex, shrink)) return addr_ranges -def GetRanges(test_base): - ranges = GetHeapRanges(test_base) - ranges.Append(GetStackRanges(test_base)) +def GetRanges(test_base, shrink=False): + ranges = GetHeapRanges(test_base, shrink) + ranges.Append(GetStackRanges(test_base, shrink)) return ranges -- GitLab From 6e02e19cd382f1524eaedd374ac33872cb565f67 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 18 Oct 2024 14:22:27 -0700 Subject: [PATCH 118/511] [lldb][docs] Add JSON symbol file docs to the ToC --- lldb/docs/index.rst | 1 + lldb/docs/{resources => use}/symbolfilejson.rst | 0 2 files changed, 1 insertion(+) rename lldb/docs/{resources => use}/symbolfilejson.rst (100%) diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index b91077d66089..e2c15d872b4b 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -134,6 +134,7 @@ interesting areas to contribute to lldb. use/intel_pt use/ondemand use/aarch64-linux + use/symbolfilejson use/troubleshooting use/links Man Page diff --git a/lldb/docs/resources/symbolfilejson.rst b/lldb/docs/use/symbolfilejson.rst similarity index 100% rename from lldb/docs/resources/symbolfilejson.rst rename to lldb/docs/use/symbolfilejson.rst -- GitLab From 659192b1843c4af180700783caca4cdc7afa3eab Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 18 Oct 2024 14:26:57 -0700 Subject: [PATCH 119/511] [NFC][MLIR][TableGen] Eliminate `llvm::` for commonly used types (#112456) Eliminate `llvm::` namespace qualifier for commonly used types in MLIR TableGen backends to reduce code clutter. 
--- mlir/lib/TableGen/AttrOrTypeDef.cpp | 81 ++++++++--------- mlir/lib/TableGen/Attribute.cpp | 25 +++--- mlir/lib/TableGen/Builder.cpp | 24 ++--- mlir/lib/TableGen/CodeGenHelpers.cpp | 18 ++-- mlir/lib/TableGen/Interfaces.cpp | 30 ++++--- mlir/lib/TableGen/Operator.cpp | 38 ++++---- mlir/lib/TableGen/Pattern.cpp | 126 ++++++++++++++------------- mlir/lib/TableGen/Predicate.cpp | 23 ++--- mlir/lib/TableGen/Type.cpp | 5 +- 9 files changed, 193 insertions(+), 177 deletions(-) diff --git a/mlir/lib/TableGen/AttrOrTypeDef.cpp b/mlir/lib/TableGen/AttrOrTypeDef.cpp index e72ca155bcf7..9e8f789d71b5 100644 --- a/mlir/lib/TableGen/AttrOrTypeDef.cpp +++ b/mlir/lib/TableGen/AttrOrTypeDef.cpp @@ -17,6 +17,12 @@ using namespace mlir; using namespace mlir::tblgen; +using llvm::DefInit; +using llvm::Init; +using llvm::ListInit; +using llvm::Record; +using llvm::RecordVal; +using llvm::StringInit; //===----------------------------------------------------------------------===// // AttrOrTypeBuilder @@ -35,14 +41,13 @@ bool AttrOrTypeBuilder::hasInferredContextParameter() const { // AttrOrTypeDef //===----------------------------------------------------------------------===// -AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) { +AttrOrTypeDef::AttrOrTypeDef(const Record *def) : def(def) { // Populate the builders. - auto *builderList = - dyn_cast_or_null(def->getValueInit("builders")); + const auto *builderList = + dyn_cast_or_null(def->getValueInit("builders")); if (builderList && !builderList->empty()) { - for (const llvm::Init *init : builderList->getValues()) { - AttrOrTypeBuilder builder(cast(init)->getDef(), - def->getLoc()); + for (const Init *init : builderList->getValues()) { + AttrOrTypeBuilder builder(cast(init)->getDef(), def->getLoc()); // Ensure that all parameters have names. for (const AttrOrTypeBuilder::Parameter ¶m : @@ -56,16 +61,16 @@ AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) { // Populate the traits. if (auto *traitList = def->getValueAsListInit("traits")) { - SmallPtrSet traitSet; + SmallPtrSet traitSet; traits.reserve(traitSet.size()); - llvm::unique_function processTraitList = - [&](const llvm::ListInit *traitList) { + llvm::unique_function processTraitList = + [&](const ListInit *traitList) { for (auto *traitInit : *traitList) { if (!traitSet.insert(traitInit).second) continue; // If this is an interface, add any bases to the trait list. - auto *traitDef = cast(traitInit)->getDef(); + auto *traitDef = cast(traitInit)->getDef(); if (traitDef->isSubClassOf("Interface")) { if (auto *bases = traitDef->getValueAsListInit("baseInterfaces")) processTraitList(bases); @@ -111,7 +116,7 @@ AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) { } Dialect AttrOrTypeDef::getDialect() const { - auto *dialect = dyn_cast(def->getValue("dialect")->getValue()); + const auto *dialect = dyn_cast(def->getValue("dialect")->getValue()); return Dialect(dialect ? 
dialect->getDef() : nullptr); } @@ -126,8 +131,8 @@ StringRef AttrOrTypeDef::getCppBaseClassName() const { } bool AttrOrTypeDef::hasDescription() const { - const llvm::RecordVal *desc = def->getValue("description"); - return desc && isa(desc->getValue()); + const RecordVal *desc = def->getValue("description"); + return desc && isa(desc->getValue()); } StringRef AttrOrTypeDef::getDescription() const { @@ -135,8 +140,8 @@ StringRef AttrOrTypeDef::getDescription() const { } bool AttrOrTypeDef::hasSummary() const { - const llvm::RecordVal *summary = def->getValue("summary"); - return summary && isa(summary->getValue()); + const RecordVal *summary = def->getValue("summary"); + return summary && isa(summary->getValue()); } StringRef AttrOrTypeDef::getSummary() const { @@ -249,9 +254,9 @@ StringRef TypeDef::getTypeName() const { template auto AttrOrTypeParameter::getDefValue(StringRef name) const { std::optional().getValue())> result; - if (auto *param = dyn_cast(getDef())) - if (auto *init = param->getDef()->getValue(name)) - if (auto *value = dyn_cast_or_null(init->getValue())) + if (const auto *param = dyn_cast(getDef())) + if (const auto *init = param->getDef()->getValue(name)) + if (const auto *value = dyn_cast_or_null(init->getValue())) result = value->getValue(); return result; } @@ -270,20 +275,20 @@ std::string AttrOrTypeParameter::getAccessorName() const { } std::optional AttrOrTypeParameter::getAllocator() const { - return getDefValue("allocator"); + return getDefValue("allocator"); } StringRef AttrOrTypeParameter::getComparator() const { - return getDefValue("comparator").value_or("$_lhs == $_rhs"); + return getDefValue("comparator").value_or("$_lhs == $_rhs"); } StringRef AttrOrTypeParameter::getCppType() const { - if (auto *stringType = dyn_cast(getDef())) + if (auto *stringType = dyn_cast(getDef())) return stringType->getValue(); - auto cppType = getDefValue("cppType"); + auto cppType = getDefValue("cppType"); if (cppType) return *cppType; - if (auto *init = dyn_cast(getDef())) + if (const auto *init = dyn_cast(getDef())) llvm::PrintFatalError( init->getDef()->getLoc(), Twine("Missing `cppType` field in Attribute/Type parameter: ") + @@ -295,34 +300,33 @@ StringRef AttrOrTypeParameter::getCppType() const { } StringRef AttrOrTypeParameter::getCppAccessorType() const { - return getDefValue("cppAccessorType") - .value_or(getCppType()); + return getDefValue("cppAccessorType").value_or(getCppType()); } StringRef AttrOrTypeParameter::getCppStorageType() const { - return getDefValue("cppStorageType").value_or(getCppType()); + return getDefValue("cppStorageType").value_or(getCppType()); } StringRef AttrOrTypeParameter::getConvertFromStorage() const { - return getDefValue("convertFromStorage").value_or("$_self"); + return getDefValue("convertFromStorage").value_or("$_self"); } std::optional AttrOrTypeParameter::getParser() const { - return getDefValue("parser"); + return getDefValue("parser"); } std::optional AttrOrTypeParameter::getPrinter() const { - return getDefValue("printer"); + return getDefValue("printer"); } std::optional AttrOrTypeParameter::getSummary() const { - return getDefValue("summary"); + return getDefValue("summary"); } StringRef AttrOrTypeParameter::getSyntax() const { - if (auto *stringType = dyn_cast(getDef())) + if (auto *stringType = dyn_cast(getDef())) return stringType->getValue(); - return getDefValue("syntax").value_or(getCppType()); + return getDefValue("syntax").value_or(getCppType()); } bool AttrOrTypeParameter::isOptional() const { @@ -330,17 +334,14 @@ bool 
AttrOrTypeParameter::isOptional() const { } std::optional AttrOrTypeParameter::getDefaultValue() const { - std::optional result = - getDefValue("defaultValue"); + std::optional result = getDefValue("defaultValue"); return result && !result->empty() ? result : std::nullopt; } -const llvm::Init *AttrOrTypeParameter::getDef() const { - return def->getArg(index); -} +const Init *AttrOrTypeParameter::getDef() const { return def->getArg(index); } std::optional AttrOrTypeParameter::getConstraint() const { - if (auto *param = dyn_cast(getDef())) + if (const auto *param = dyn_cast(getDef())) if (param->getDef()->isSubClassOf("Constraint")) return Constraint(param->getDef()); return std::nullopt; @@ -351,8 +352,8 @@ std::optional AttrOrTypeParameter::getConstraint() const { //===----------------------------------------------------------------------===// bool AttributeSelfTypeParameter::classof(const AttrOrTypeParameter *param) { - const llvm::Init *paramDef = param->getDef(); - if (auto *paramDefInit = dyn_cast(paramDef)) + const Init *paramDef = param->getDef(); + if (const auto *paramDefInit = dyn_cast(paramDef)) return paramDefInit->getDef()->isSubClassOf("AttributeSelfTypeParameter"); return false; } diff --git a/mlir/lib/TableGen/Attribute.cpp b/mlir/lib/TableGen/Attribute.cpp index 887553bca661..f9fc58a40f33 100644 --- a/mlir/lib/TableGen/Attribute.cpp +++ b/mlir/lib/TableGen/Attribute.cpp @@ -71,7 +71,7 @@ StringRef Attribute::getReturnType() const { // Return the type constraint corresponding to the type of this attribute, or // std::nullopt if this is not a TypedAttr. std::optional Attribute::getValueType() const { - if (auto *defInit = dyn_cast(def->getValueInit("valueType"))) + if (const auto *defInit = dyn_cast(def->getValueInit("valueType"))) return Type(defInit->getDef()); return std::nullopt; } @@ -92,8 +92,7 @@ StringRef Attribute::getConstBuilderTemplate() const { } Attribute Attribute::getBaseAttr() const { - if (const auto *defInit = - llvm::dyn_cast(def->getValueInit("baseAttr"))) { + if (const auto *defInit = dyn_cast(def->getValueInit("baseAttr"))) { return Attribute(defInit).getBaseAttr(); } return *this; @@ -132,7 +131,7 @@ Dialect Attribute::getDialect() const { return Dialect(nullptr); } -const llvm::Record &Attribute::getDef() const { return *def; } +const Record &Attribute::getDef() const { return *def; } ConstantAttr::ConstantAttr(const DefInit *init) : def(init->getDef()) { assert(def->isSubClassOf("ConstantAttr") && @@ -147,12 +146,12 @@ StringRef ConstantAttr::getConstantValue() const { return def->getValueAsString("value"); } -EnumAttrCase::EnumAttrCase(const llvm::Record *record) : Attribute(record) { +EnumAttrCase::EnumAttrCase(const Record *record) : Attribute(record) { assert(isSubClassOf("EnumAttrCaseInfo") && "must be subclass of TableGen 'EnumAttrInfo' class"); } -EnumAttrCase::EnumAttrCase(const llvm::DefInit *init) +EnumAttrCase::EnumAttrCase(const DefInit *init) : EnumAttrCase(init->getDef()) {} StringRef EnumAttrCase::getSymbol() const { @@ -163,16 +162,16 @@ StringRef EnumAttrCase::getStr() const { return def->getValueAsString("str"); } int64_t EnumAttrCase::getValue() const { return def->getValueAsInt("value"); } -const llvm::Record &EnumAttrCase::getDef() const { return *def; } +const Record &EnumAttrCase::getDef() const { return *def; } -EnumAttr::EnumAttr(const llvm::Record *record) : Attribute(record) { +EnumAttr::EnumAttr(const Record *record) : Attribute(record) { assert(isSubClassOf("EnumAttrInfo") && "must be subclass of TableGen 'EnumAttr' 
class"); } -EnumAttr::EnumAttr(const llvm::Record &record) : Attribute(&record) {} +EnumAttr::EnumAttr(const Record &record) : Attribute(&record) {} -EnumAttr::EnumAttr(const llvm::DefInit *init) : EnumAttr(init->getDef()) {} +EnumAttr::EnumAttr(const DefInit *init) : EnumAttr(init->getDef()) {} bool EnumAttr::classof(const Attribute *attr) { return attr->isSubClassOf("EnumAttrInfo"); @@ -218,8 +217,8 @@ std::vector EnumAttr::getAllCases() const { std::vector cases; cases.reserve(inits->size()); - for (const llvm::Init *init : *inits) { - cases.emplace_back(cast(init)); + for (const Init *init : *inits) { + cases.emplace_back(cast(init)); } return cases; @@ -229,7 +228,7 @@ bool EnumAttr::genSpecializedAttr() const { return def->getValueAsBit("genSpecializedAttr"); } -const llvm::Record *EnumAttr::getBaseAttrClass() const { +const Record *EnumAttr::getBaseAttrClass() const { return def->getValueAsDef("baseAttrClass"); } diff --git a/mlir/lib/TableGen/Builder.cpp b/mlir/lib/TableGen/Builder.cpp index 044765c72601..a94e1cca5fc5 100644 --- a/mlir/lib/TableGen/Builder.cpp +++ b/mlir/lib/TableGen/Builder.cpp @@ -12,6 +12,11 @@ using namespace mlir; using namespace mlir::tblgen; +using llvm::DagInit; +using llvm::DefInit; +using llvm::Init; +using llvm::Record; +using llvm::StringInit; //===----------------------------------------------------------------------===// // Builder::Parameter @@ -19,9 +24,9 @@ using namespace mlir::tblgen; /// Return a string containing the C++ type of this parameter. StringRef Builder::Parameter::getCppType() const { - if (const auto *stringInit = dyn_cast(def)) + if (const auto *stringInit = dyn_cast(def)) return stringInit->getValue(); - const llvm::Record *record = cast(def)->getDef(); + const Record *record = cast(def)->getDef(); // Inlining the first part of `Record::getValueAsString` to give better // error messages. const llvm::RecordVal *type = record->getValue("type"); @@ -35,9 +40,9 @@ StringRef Builder::Parameter::getCppType() const { /// Return an optional string containing the default value to use for this /// parameter. std::optional Builder::Parameter::getDefaultValue() const { - if (isa(def)) + if (isa(def)) return std::nullopt; - const llvm::Record *record = cast(def)->getDef(); + const Record *record = cast(def)->getDef(); std::optional value = record->getValueAsOptionalString("defaultValue"); return value && !value->empty() ? value : std::nullopt; @@ -47,18 +52,17 @@ std::optional Builder::Parameter::getDefaultValue() const { // Builder //===----------------------------------------------------------------------===// -Builder::Builder(const llvm::Record *record, ArrayRef loc) - : def(record) { +Builder::Builder(const Record *record, ArrayRef loc) : def(record) { // Initialize the parameters of the builder. - const llvm::DagInit *dag = def->getValueAsDag("dagParams"); - auto *defInit = dyn_cast(dag->getOperator()); + const DagInit *dag = def->getValueAsDag("dagParams"); + auto *defInit = dyn_cast(dag->getOperator()); if (!defInit || defInit->getDef()->getName() != "ins") PrintFatalError(def->getLoc(), "expected 'ins' in builders"); bool seenDefaultValue = false; for (unsigned i = 0, e = dag->getNumArgs(); i < e; ++i) { - const llvm::StringInit *paramName = dag->getArgName(i); - const llvm::Init *paramValue = dag->getArg(i); + const StringInit *paramName = dag->getArgName(i); + const Init *paramValue = dag->getArg(i); Parameter param(paramName ? 
paramName->getValue() : std::optional(), paramValue); diff --git a/mlir/lib/TableGen/CodeGenHelpers.cpp b/mlir/lib/TableGen/CodeGenHelpers.cpp index 2f13887aa0bb..747af1ce5a4d 100644 --- a/mlir/lib/TableGen/CodeGenHelpers.cpp +++ b/mlir/lib/TableGen/CodeGenHelpers.cpp @@ -24,32 +24,32 @@ using namespace mlir::tblgen; /// Generate a unique label based on the current file name to prevent name /// collisions if multiple generated files are included at once. -static std::string getUniqueOutputLabel(const llvm::RecordKeeper &records, +static std::string getUniqueOutputLabel(const RecordKeeper &records, StringRef tag) { // Use the input file name when generating a unique name. std::string inputFilename = records.getInputFilename(); // Drop all but the base filename. - StringRef nameRef = llvm::sys::path::filename(inputFilename); + StringRef nameRef = sys::path::filename(inputFilename); nameRef.consume_back(".td"); // Sanitize any invalid characters. std::string uniqueName(tag); for (char c : nameRef) { - if (llvm::isAlnum(c) || c == '_') + if (isAlnum(c) || c == '_') uniqueName.push_back(c); else - uniqueName.append(llvm::utohexstr((unsigned char)c)); + uniqueName.append(utohexstr((unsigned char)c)); } return uniqueName; } StaticVerifierFunctionEmitter::StaticVerifierFunctionEmitter( - raw_ostream &os, const llvm::RecordKeeper &records, StringRef tag) + raw_ostream &os, const RecordKeeper &records, StringRef tag) : os(os), uniqueOutputLabel(getUniqueOutputLabel(records, tag)) {} void StaticVerifierFunctionEmitter::emitOpConstraints( - ArrayRef opDefs) { + ArrayRef opDefs) { NamespaceEmitter namespaceEmitter(os, Operator(*opDefs[0]).getCppNamespace()); emitTypeConstraints(); emitAttrConstraints(); @@ -58,7 +58,7 @@ void StaticVerifierFunctionEmitter::emitOpConstraints( } void StaticVerifierFunctionEmitter::emitPatternConstraints( - const llvm::ArrayRef constraints) { + const ArrayRef constraints) { collectPatternConstraints(constraints); emitPatternConstraints(); } @@ -298,7 +298,7 @@ void StaticVerifierFunctionEmitter::collectOpConstraints( } void StaticVerifierFunctionEmitter::collectPatternConstraints( - const llvm::ArrayRef constraints) { + const ArrayRef constraints) { for (auto &leaf : constraints) { assert(leaf.isOperandMatcher() || leaf.isAttrMatcher()); collectConstraint( @@ -313,7 +313,7 @@ void StaticVerifierFunctionEmitter::collectPatternConstraints( std::string mlir::tblgen::escapeString(StringRef value) { std::string ret; - llvm::raw_string_ostream os(ret); + raw_string_ostream os(ret); os.write_escaped(value); return ret; } diff --git a/mlir/lib/TableGen/Interfaces.cpp b/mlir/lib/TableGen/Interfaces.cpp index 4a6709a43d0a..dc9a74c4e8a9 100644 --- a/mlir/lib/TableGen/Interfaces.cpp +++ b/mlir/lib/TableGen/Interfaces.cpp @@ -16,17 +16,22 @@ using namespace mlir; using namespace mlir::tblgen; +using llvm::DagInit; +using llvm::DefInit; +using llvm::Init; +using llvm::ListInit; +using llvm::Record; +using llvm::StringInit; //===----------------------------------------------------------------------===// // InterfaceMethod //===----------------------------------------------------------------------===// -InterfaceMethod::InterfaceMethod(const llvm::Record *def) : def(def) { - const llvm::DagInit *args = def->getValueAsDag("arguments"); +InterfaceMethod::InterfaceMethod(const Record *def) : def(def) { + const DagInit *args = def->getValueAsDag("arguments"); for (unsigned i = 0, e = args->getNumArgs(); i != e; ++i) { - arguments.push_back( - {llvm::cast(args->getArg(i))->getValue(), - 
args->getArgNameStr(i)}); + arguments.push_back({cast(args->getArg(i))->getValue(), + args->getArgNameStr(i)}); } } @@ -72,18 +77,17 @@ bool InterfaceMethod::arg_empty() const { return arguments.empty(); } // Interface //===----------------------------------------------------------------------===// -Interface::Interface(const llvm::Record *def) : def(def) { +Interface::Interface(const Record *def) : def(def) { assert(def->isSubClassOf("Interface") && "must be subclass of TableGen 'Interface' class"); // Initialize the interface methods. - auto *listInit = dyn_cast(def->getValueInit("methods")); - for (const llvm::Init *init : listInit->getValues()) - methods.emplace_back(cast(init)->getDef()); + auto *listInit = dyn_cast(def->getValueInit("methods")); + for (const Init *init : listInit->getValues()) + methods.emplace_back(cast(init)->getDef()); // Initialize the interface base classes. - auto *basesInit = - dyn_cast(def->getValueInit("baseInterfaces")); + auto *basesInit = dyn_cast(def->getValueInit("baseInterfaces")); // Chained inheritance will produce duplicates in the base interface set. StringSet<> basesAdded; llvm::unique_function addBaseInterfaceFn = @@ -98,8 +102,8 @@ Interface::Interface(const llvm::Record *def) : def(def) { baseInterfaces.push_back(std::make_unique(baseInterface)); basesAdded.insert(baseInterface.getName()); }; - for (const llvm::Init *init : basesInit->getValues()) - addBaseInterfaceFn(Interface(cast(init)->getDef())); + for (const Init *init : basesInit->getValues()) + addBaseInterfaceFn(Interface(cast(init)->getDef())); } // Return the name of this interface. diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index 86670e9f8712..904cc6637d53 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -35,9 +35,12 @@ using namespace mlir::tblgen; using llvm::DagInit; using llvm::DefInit; +using llvm::Init; +using llvm::ListInit; using llvm::Record; +using llvm::StringInit; -Operator::Operator(const llvm::Record &def) +Operator::Operator(const Record &def) : dialect(def.getValueAsDef("opDialect")), def(def) { // The first `_` in the op's TableGen def name is treated as separating the // dialect prefix and the op class name. The dialect prefix will be ignored if @@ -179,7 +182,7 @@ StringRef Operator::getExtraClassDefinition() const { return def.getValueAsString(attr); } -const llvm::Record &Operator::getDef() const { return def; } +const Record &Operator::getDef() const { return def; } bool Operator::skipDefaultBuilders() const { return def.getValueAsBit("skipDefaultBuilders"); @@ -429,7 +432,7 @@ void Operator::populateTypeInferenceInfo( // Use `AllTypesMatch` and `TypesMatchWith` operation traits to build the // result type inference graph. for (const Trait &trait : traits) { - const llvm::Record &def = trait.getDef(); + const Record &def = trait.getDef(); // If the infer type op interface was manually added, then treat it as // intention that the op needs special handling. @@ -614,9 +617,8 @@ void Operator::populateOpStructure() { def.getLoc(), "unsupported attribute modelling, only single class expected"); } - attributes.push_back( - {cast(val.getNameInit())->getValue(), - Attribute(cast(val.getValue()))}); + attributes.push_back({cast(val.getNameInit())->getValue(), + Attribute(cast(val.getValue()))}); } } @@ -701,7 +703,7 @@ void Operator::populateOpStructure() { // tablegen is easy, making them unique less so, so dedupe here. 
if (auto *traitList = def.getValueAsListInit("traits")) { // This is uniquing based on pointers of the trait. - SmallPtrSet traitSet; + SmallPtrSet traitSet; traits.reserve(traitSet.size()); // The declaration order of traits imply the verification order of traits. @@ -721,8 +723,8 @@ void Operator::populateOpStructure() { " to precede it in traits list"); }; - std::function insert; - insert = [&](const llvm::ListInit *traitList) { + std::function insert; + insert = [&](const ListInit *traitList) { for (auto *traitInit : *traitList) { auto *def = cast(traitInit)->getDef(); if (def->isSubClassOf("TraitList")) { @@ -777,11 +779,10 @@ void Operator::populateOpStructure() { } // Populate the builders. - auto *builderList = - dyn_cast_or_null(def.getValueInit("builders")); + auto *builderList = dyn_cast_or_null(def.getValueInit("builders")); if (builderList && !builderList->empty()) { - for (const llvm::Init *init : builderList->getValues()) - builders.emplace_back(cast(init)->getDef(), def.getLoc()); + for (const Init *init : builderList->getValues()) + builders.emplace_back(cast(init)->getDef(), def.getLoc()); } else if (skipDefaultBuilders()) { PrintFatalError( def.getLoc(), @@ -814,13 +815,12 @@ StringRef Operator::getSummary() const { bool Operator::hasAssemblyFormat() const { auto *valueInit = def.getValueInit("assemblyFormat"); - return isa(valueInit); + return isa(valueInit); } StringRef Operator::getAssemblyFormat() const { - return TypeSwitch( - def.getValueInit("assemblyFormat")) - .Case([&](auto *init) { return init->getValue(); }); + return TypeSwitch(def.getValueInit("assemblyFormat")) + .Case([&](auto *init) { return init->getValue(); }); } void Operator::print(llvm::raw_ostream &os) const { @@ -833,9 +833,9 @@ void Operator::print(llvm::raw_ostream &os) const { } } -auto Operator::VariableDecoratorIterator::unwrap(const llvm::Init *init) +auto Operator::VariableDecoratorIterator::unwrap(const Init *init) -> VariableDecorator { - return VariableDecorator(cast(init)->getDef()); + return VariableDecorator(cast(init)->getDef()); } auto Operator::getArgToOperandOrAttribute(int index) const diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp index bee20354387f..ffa0c067b028 100644 --- a/mlir/lib/TableGen/Pattern.cpp +++ b/mlir/lib/TableGen/Pattern.cpp @@ -26,7 +26,12 @@ using namespace mlir; using namespace tblgen; +using llvm::DagInit; +using llvm::dbgs; +using llvm::DefInit; using llvm::formatv; +using llvm::IntInit; +using llvm::Record; //===----------------------------------------------------------------------===// // DagLeaf @@ -61,31 +66,31 @@ bool DagLeaf::isStringAttr() const { return isa(def); } Constraint DagLeaf::getAsConstraint() const { assert((isOperandMatcher() || isAttrMatcher()) && "the DAG leaf must be operand or attribute"); - return Constraint(cast(def)->getDef()); + return Constraint(cast(def)->getDef()); } ConstantAttr DagLeaf::getAsConstantAttr() const { assert(isConstantAttr() && "the DAG leaf must be constant attribute"); - return ConstantAttr(cast(def)); + return ConstantAttr(cast(def)); } EnumAttrCase DagLeaf::getAsEnumAttrCase() const { assert(isEnumAttrCase() && "the DAG leaf must be an enum attribute case"); - return EnumAttrCase(cast(def)); + return EnumAttrCase(cast(def)); } std::string DagLeaf::getConditionTemplate() const { return getAsConstraint().getConditionTemplate(); } -llvm::StringRef DagLeaf::getNativeCodeTemplate() const { +StringRef DagLeaf::getNativeCodeTemplate() const { assert(isNativeCodeCall() && "the DAG leaf must 
be NativeCodeCall"); - return cast(def)->getDef()->getValueAsString("expression"); + return cast(def)->getDef()->getValueAsString("expression"); } int DagLeaf::getNumReturnsOfNativeCode() const { assert(isNativeCodeCall() && "the DAG leaf must be NativeCodeCall"); - return cast(def)->getDef()->getValueAsInt("numReturns"); + return cast(def)->getDef()->getValueAsInt("numReturns"); } std::string DagLeaf::getStringAttr() const { @@ -93,7 +98,7 @@ std::string DagLeaf::getStringAttr() const { return def->getAsUnquotedString(); } bool DagLeaf::isSubClassOf(StringRef superclass) const { - if (auto *defInit = dyn_cast_or_null(def)) + if (auto *defInit = dyn_cast_or_null(def)) return defInit->getDef()->isSubClassOf(superclass); return false; } @@ -108,7 +113,7 @@ void DagLeaf::print(raw_ostream &os) const { //===----------------------------------------------------------------------===// bool DagNode::isNativeCodeCall() const { - if (auto *defInit = dyn_cast_or_null(node->getOperator())) + if (auto *defInit = dyn_cast_or_null(node->getOperator())) return defInit->getDef()->isSubClassOf("NativeCodeCall"); return false; } @@ -119,25 +124,24 @@ bool DagNode::isOperation() const { !isVariadic(); } -llvm::StringRef DagNode::getNativeCodeTemplate() const { +StringRef DagNode::getNativeCodeTemplate() const { assert(isNativeCodeCall() && "the DAG leaf must be NativeCodeCall"); - return cast(node->getOperator()) + return cast(node->getOperator()) ->getDef() ->getValueAsString("expression"); } int DagNode::getNumReturnsOfNativeCode() const { assert(isNativeCodeCall() && "the DAG leaf must be NativeCodeCall"); - return cast(node->getOperator()) + return cast(node->getOperator()) ->getDef() ->getValueAsInt("numReturns"); } -llvm::StringRef DagNode::getSymbol() const { return node->getNameStr(); } +StringRef DagNode::getSymbol() const { return node->getNameStr(); } Operator &DagNode::getDialectOp(RecordOperatorMap *mapper) const { - const llvm::Record *opDef = - cast(node->getOperator())->getDef(); + const Record *opDef = cast(node->getOperator())->getDef(); auto [it, inserted] = mapper->try_emplace(opDef); if (inserted) it->second = std::make_unique(opDef); @@ -158,11 +162,11 @@ int DagNode::getNumOps() const { int DagNode::getNumArgs() const { return node->getNumArgs(); } bool DagNode::isNestedDagArg(unsigned index) const { - return isa(node->getArg(index)); + return isa(node->getArg(index)); } DagNode DagNode::getArgAsNestedDag(unsigned index) const { - return DagNode(dyn_cast_or_null(node->getArg(index))); + return DagNode(dyn_cast_or_null(node->getArg(index))); } DagLeaf DagNode::getArgAsLeaf(unsigned index) const { @@ -175,27 +179,27 @@ StringRef DagNode::getArgName(unsigned index) const { } bool DagNode::isReplaceWithValue() const { - auto *dagOpDef = cast(node->getOperator())->getDef(); + auto *dagOpDef = cast(node->getOperator())->getDef(); return dagOpDef->getName() == "replaceWithValue"; } bool DagNode::isLocationDirective() const { - auto *dagOpDef = cast(node->getOperator())->getDef(); + auto *dagOpDef = cast(node->getOperator())->getDef(); return dagOpDef->getName() == "location"; } bool DagNode::isReturnTypeDirective() const { - auto *dagOpDef = cast(node->getOperator())->getDef(); + auto *dagOpDef = cast(node->getOperator())->getDef(); return dagOpDef->getName() == "returnType"; } bool DagNode::isEither() const { - auto *dagOpDef = cast(node->getOperator())->getDef(); + auto *dagOpDef = cast(node->getOperator())->getDef(); return dagOpDef->getName() == "either"; } bool DagNode::isVariadic() const 
{ - auto *dagOpDef = cast(node->getOperator())->getDef(); + auto *dagOpDef = cast(node->getOperator())->getDef(); return dagOpDef->getName() == "variadic"; } @@ -246,7 +250,7 @@ std::string SymbolInfoMap::SymbolInfo::getVarName(StringRef name) const { } std::string SymbolInfoMap::SymbolInfo::getVarTypeStr(StringRef name) const { - LLVM_DEBUG(llvm::dbgs() << "getVarTypeStr for '" << name << "': "); + LLVM_DEBUG(dbgs() << "getVarTypeStr for '" << name << "': "); switch (kind) { case Kind::Attr: { if (op) @@ -277,26 +281,26 @@ std::string SymbolInfoMap::SymbolInfo::getVarTypeStr(StringRef name) const { } std::string SymbolInfoMap::SymbolInfo::getVarDecl(StringRef name) const { - LLVM_DEBUG(llvm::dbgs() << "getVarDecl for '" << name << "': "); + LLVM_DEBUG(dbgs() << "getVarDecl for '" << name << "': "); std::string varInit = kind == Kind::Operand ? "(op0->getOperands())" : ""; return std::string( formatv("{0} {1}{2};\n", getVarTypeStr(name), getVarName(name), varInit)); } std::string SymbolInfoMap::SymbolInfo::getArgDecl(StringRef name) const { - LLVM_DEBUG(llvm::dbgs() << "getArgDecl for '" << name << "': "); + LLVM_DEBUG(dbgs() << "getArgDecl for '" << name << "': "); return std::string( formatv("{0} &{1}", getVarTypeStr(name), getVarName(name))); } std::string SymbolInfoMap::SymbolInfo::getValueAndRangeUse( StringRef name, int index, const char *fmt, const char *separator) const { - LLVM_DEBUG(llvm::dbgs() << "getValueAndRangeUse for '" << name << "': "); + LLVM_DEBUG(dbgs() << "getValueAndRangeUse for '" << name << "': "); switch (kind) { case Kind::Attr: { assert(index < 0); auto repl = formatv(fmt, name); - LLVM_DEBUG(llvm::dbgs() << repl << " (Attr)\n"); + LLVM_DEBUG(dbgs() << repl << " (Attr)\n"); return std::string(repl); } case Kind::Operand: { @@ -307,11 +311,11 @@ std::string SymbolInfoMap::SymbolInfo::getValueAndRangeUse( // the value itself. if (operand->isVariableLength() && !getVariadicSubIndex().has_value()) { auto repl = formatv(fmt, name); - LLVM_DEBUG(llvm::dbgs() << repl << " (VariadicOperand)\n"); + LLVM_DEBUG(dbgs() << repl << " (VariadicOperand)\n"); return std::string(repl); } auto repl = formatv(fmt, formatv("(*{0}.begin())", name)); - LLVM_DEBUG(llvm::dbgs() << repl << " (SingleOperand)\n"); + LLVM_DEBUG(dbgs() << repl << " (SingleOperand)\n"); return std::string(repl); } case Kind::Result: { @@ -323,14 +327,14 @@ std::string SymbolInfoMap::SymbolInfo::getValueAndRangeUse( if (!op->getResult(index).isVariadic()) v = std::string(formatv("(*{0}.begin())", v)); auto repl = formatv(fmt, v); - LLVM_DEBUG(llvm::dbgs() << repl << " (SingleResult)\n"); + LLVM_DEBUG(dbgs() << repl << " (SingleResult)\n"); return std::string(repl); } // If this op has no result at all but still we bind a symbol to it, it // means we want to capture the op itself. 
if (op->getNumResults() == 0) { - LLVM_DEBUG(llvm::dbgs() << name << " (Op)\n"); + LLVM_DEBUG(dbgs() << name << " (Op)\n"); return formatv(fmt, name); } @@ -347,14 +351,14 @@ std::string SymbolInfoMap::SymbolInfo::getValueAndRangeUse( values.push_back(std::string(formatv(fmt, v))); } auto repl = llvm::join(values, separator); - LLVM_DEBUG(llvm::dbgs() << repl << " (VariadicResult)\n"); + LLVM_DEBUG(dbgs() << repl << " (VariadicResult)\n"); return repl; } case Kind::Value: { assert(index < 0); assert(op == nullptr); auto repl = formatv(fmt, name); - LLVM_DEBUG(llvm::dbgs() << repl << " (Value)\n"); + LLVM_DEBUG(dbgs() << repl << " (Value)\n"); return std::string(repl); } case Kind::MultipleValues: { @@ -363,13 +367,13 @@ std::string SymbolInfoMap::SymbolInfo::getValueAndRangeUse( if (index >= 0) { std::string repl = formatv(fmt, std::string(formatv("{0}[{1}]", name, index))); - LLVM_DEBUG(llvm::dbgs() << repl << " (MultipleValues)\n"); + LLVM_DEBUG(dbgs() << repl << " (MultipleValues)\n"); return repl; } // If it doesn't specify certain element, unpack them all. auto repl = formatv(fmt, std::string(formatv("{0}.begin(), {0}.end()", name))); - LLVM_DEBUG(llvm::dbgs() << repl << " (MultipleValues)\n"); + LLVM_DEBUG(dbgs() << repl << " (MultipleValues)\n"); return std::string(repl); } } @@ -378,19 +382,19 @@ std::string SymbolInfoMap::SymbolInfo::getValueAndRangeUse( std::string SymbolInfoMap::SymbolInfo::getAllRangeUse( StringRef name, int index, const char *fmt, const char *separator) const { - LLVM_DEBUG(llvm::dbgs() << "getAllRangeUse for '" << name << "': "); + LLVM_DEBUG(dbgs() << "getAllRangeUse for '" << name << "': "); switch (kind) { case Kind::Attr: case Kind::Operand: { assert(index < 0 && "only allowed for symbol bound to result"); auto repl = formatv(fmt, name); - LLVM_DEBUG(llvm::dbgs() << repl << " (Operand/Attr)\n"); + LLVM_DEBUG(dbgs() << repl << " (Operand/Attr)\n"); return std::string(repl); } case Kind::Result: { if (index >= 0) { auto repl = formatv(fmt, formatv("{0}.getODSResults({1})", name, index)); - LLVM_DEBUG(llvm::dbgs() << repl << " (SingleResult)\n"); + LLVM_DEBUG(dbgs() << repl << " (SingleResult)\n"); return std::string(repl); } @@ -404,14 +408,14 @@ std::string SymbolInfoMap::SymbolInfo::getAllRangeUse( formatv(fmt, formatv("{0}.getODSResults({1})", name, i)))); } auto repl = llvm::join(values, separator); - LLVM_DEBUG(llvm::dbgs() << repl << " (VariadicResult)\n"); + LLVM_DEBUG(dbgs() << repl << " (VariadicResult)\n"); return repl; } case Kind::Value: { assert(index < 0 && "only allowed for symbol bound to result"); assert(op == nullptr); auto repl = formatv(fmt, formatv("{{{0}}", name)); - LLVM_DEBUG(llvm::dbgs() << repl << " (Value)\n"); + LLVM_DEBUG(dbgs() << repl << " (Value)\n"); return std::string(repl); } case Kind::MultipleValues: { @@ -420,12 +424,12 @@ std::string SymbolInfoMap::SymbolInfo::getAllRangeUse( if (index >= 0) { std::string repl = formatv(fmt, std::string(formatv("{0}[{1}]", name, index))); - LLVM_DEBUG(llvm::dbgs() << repl << " (MultipleValues)\n"); + LLVM_DEBUG(dbgs() << repl << " (MultipleValues)\n"); return repl; } auto repl = formatv(fmt, std::string(formatv("{0}.begin(), {0}.end()", name))); - LLVM_DEBUG(llvm::dbgs() << repl << " (MultipleValues)\n"); + LLVM_DEBUG(dbgs() << repl << " (MultipleValues)\n"); return std::string(repl); } } @@ -614,7 +618,7 @@ void SymbolInfoMap::assignUniqueAlternativeNames() { // Pattern //==----------------------------------------------------------------------===// -Pattern::Pattern(const llvm::Record 
*def, RecordOperatorMap *mapper) +Pattern::Pattern(const Record *def, RecordOperatorMap *mapper) : def(*def), recordOpMap(mapper) {} DagNode Pattern::getSourcePattern() const { @@ -628,26 +632,26 @@ int Pattern::getNumResultPatterns() const { DagNode Pattern::getResultPattern(unsigned index) const { auto *results = def.getValueAsListInit("resultPatterns"); - return DagNode(cast(results->getElement(index))); + return DagNode(cast(results->getElement(index))); } void Pattern::collectSourcePatternBoundSymbols(SymbolInfoMap &infoMap) { - LLVM_DEBUG(llvm::dbgs() << "start collecting source pattern bound symbols\n"); + LLVM_DEBUG(dbgs() << "start collecting source pattern bound symbols\n"); collectBoundSymbols(getSourcePattern(), infoMap, /*isSrcPattern=*/true); - LLVM_DEBUG(llvm::dbgs() << "done collecting source pattern bound symbols\n"); + LLVM_DEBUG(dbgs() << "done collecting source pattern bound symbols\n"); - LLVM_DEBUG(llvm::dbgs() << "start assigning alternative names for symbols\n"); + LLVM_DEBUG(dbgs() << "start assigning alternative names for symbols\n"); infoMap.assignUniqueAlternativeNames(); - LLVM_DEBUG(llvm::dbgs() << "done assigning alternative names for symbols\n"); + LLVM_DEBUG(dbgs() << "done assigning alternative names for symbols\n"); } void Pattern::collectResultPatternBoundSymbols(SymbolInfoMap &infoMap) { - LLVM_DEBUG(llvm::dbgs() << "start collecting result pattern bound symbols\n"); + LLVM_DEBUG(dbgs() << "start collecting result pattern bound symbols\n"); for (int i = 0, e = getNumResultPatterns(); i < e; ++i) { auto pattern = getResultPattern(i); collectBoundSymbols(pattern, infoMap, /*isSrcPattern=*/false); } - LLVM_DEBUG(llvm::dbgs() << "done collecting result pattern bound symbols\n"); + LLVM_DEBUG(dbgs() << "done collecting result pattern bound symbols\n"); } const Operator &Pattern::getSourceRootOp() { @@ -664,7 +668,7 @@ std::vector Pattern::getConstraints() const { ret.reserve(listInit->size()); for (auto *it : *listInit) { - auto *dagInit = dyn_cast(it); + auto *dagInit = dyn_cast(it); if (!dagInit) PrintFatalError(&def, "all elements in Pattern multi-entity " "constraints should be DAG nodes"); @@ -680,7 +684,7 @@ std::vector Pattern::getConstraints() const { entities.emplace_back(argName->getValue()); } - ret.emplace_back(cast(dagInit->getOperator())->getDef(), + ret.emplace_back(cast(dagInit->getOperator())->getDef(), dagInit->getNameStr(), std::move(entities)); } return ret; @@ -693,19 +697,19 @@ int Pattern::getNumSupplementalPatterns() const { DagNode Pattern::getSupplementalPattern(unsigned index) const { auto *results = def.getValueAsListInit("supplementalPatterns"); - return DagNode(cast(results->getElement(index))); + return DagNode(cast(results->getElement(index))); } int Pattern::getBenefit() const { // The initial benefit value is a heuristic with number of ops in the source // pattern. 
int initBenefit = getSourcePattern().getNumOps(); - const llvm::DagInit *delta = def.getValueAsDag("benefitDelta"); - if (delta->getNumArgs() != 1 || !isa(delta->getArg(0))) { + const DagInit *delta = def.getValueAsDag("benefitDelta"); + if (delta->getNumArgs() != 1 || !isa(delta->getArg(0))) { PrintFatalError(&def, "The 'addBenefit' takes and only takes one integer value"); } - return initBenefit + dyn_cast(delta->getArg(0))->getValue(); + return initBenefit + dyn_cast(delta->getArg(0))->getValue(); } std::vector Pattern::getLocation() const { @@ -736,8 +740,8 @@ void Pattern::collectBoundSymbols(DagNode tree, SymbolInfoMap &infoMap, if (tree.isNativeCodeCall()) { if (!treeName.empty()) { if (!isSrcPattern) { - LLVM_DEBUG(llvm::dbgs() << "found symbol bound to NativeCodeCall: " - << treeName << '\n'); + LLVM_DEBUG(dbgs() << "found symbol bound to NativeCodeCall: " + << treeName << '\n'); verifyBind( infoMap.bindValues(treeName, tree.getNumReturnsOfNativeCode()), treeName); @@ -820,8 +824,8 @@ void Pattern::collectBoundSymbols(DagNode tree, SymbolInfoMap &infoMap, // The name attached to the DAG node's operator is for representing the // results generated from this op. It should be remembered as bound results. if (!treeName.empty()) { - LLVM_DEBUG(llvm::dbgs() - << "found symbol bound to op result: " << treeName << '\n'); + LLVM_DEBUG(dbgs() << "found symbol bound to op result: " << treeName + << '\n'); verifyBind(infoMap.bindOpResult(treeName, op), treeName); } @@ -896,8 +900,8 @@ void Pattern::collectBoundSymbols(DagNode tree, SymbolInfoMap &infoMap, auto treeArgName = tree.getArgName(i); // `$_` is a special symbol meaning ignore the current argument. if (!treeArgName.empty() && treeArgName != "_") { - LLVM_DEBUG(llvm::dbgs() << "found symbol bound to op argument: " - << treeArgName << '\n'); + LLVM_DEBUG(dbgs() << "found symbol bound to op argument: " + << treeArgName << '\n'); verifyBind(infoMap.bindOpArgument(tree, treeArgName, op, opArgIdx), treeArgName); } diff --git a/mlir/lib/TableGen/Predicate.cpp b/mlir/lib/TableGen/Predicate.cpp index 0e38dab8491c..f71dd0bd35f8 100644 --- a/mlir/lib/TableGen/Predicate.cpp +++ b/mlir/lib/TableGen/Predicate.cpp @@ -20,15 +20,18 @@ using namespace mlir; using namespace tblgen; +using llvm::Init; +using llvm::Record; +using llvm::SpecificBumpPtrAllocator; // Construct a Predicate from a record. -Pred::Pred(const llvm::Record *record) : def(record) { +Pred::Pred(const Record *record) : def(record) { assert(def->isSubClassOf("Pred") && "must be a subclass of TableGen 'Pred' class"); } // Construct a Predicate from an initializer. 
-Pred::Pred(const llvm::Init *init) { +Pred::Pred(const Init *init) { if (const auto *defInit = dyn_cast_or_null(init)) def = defInit->getDef(); } @@ -48,12 +51,12 @@ bool Pred::isCombined() const { ArrayRef Pred::getLoc() const { return def->getLoc(); } -CPred::CPred(const llvm::Record *record) : Pred(record) { +CPred::CPred(const Record *record) : Pred(record) { assert(def->isSubClassOf("CPred") && "must be a subclass of Tablegen 'CPred' class"); } -CPred::CPred(const llvm::Init *init) : Pred(init) { +CPred::CPred(const Init *init) : Pred(init) { assert((!def || def->isSubClassOf("CPred")) && "must be a subclass of Tablegen 'CPred' class"); } @@ -64,22 +67,22 @@ std::string CPred::getConditionImpl() const { return std::string(def->getValueAsString("predExpr")); } -CombinedPred::CombinedPred(const llvm::Record *record) : Pred(record) { +CombinedPred::CombinedPred(const Record *record) : Pred(record) { assert(def->isSubClassOf("CombinedPred") && "must be a subclass of Tablegen 'CombinedPred' class"); } -CombinedPred::CombinedPred(const llvm::Init *init) : Pred(init) { +CombinedPred::CombinedPred(const Init *init) : Pred(init) { assert((!def || def->isSubClassOf("CombinedPred")) && "must be a subclass of Tablegen 'CombinedPred' class"); } -const llvm::Record *CombinedPred::getCombinerDef() const { +const Record *CombinedPred::getCombinerDef() const { assert(def->getValue("kind") && "CombinedPred must have a value 'kind'"); return def->getValueAsDef("kind"); } -std::vector CombinedPred::getChildren() const { +std::vector CombinedPred::getChildren() const { assert(def->getValue("children") && "CombinedPred must have a value 'children'"); return def->getValueAsListOfDefs("children"); @@ -156,7 +159,7 @@ static void performSubstitutions(std::string &str, // All nodes are created within "allocator". static PredNode * buildPredicateTree(const Pred &root, - llvm::SpecificBumpPtrAllocator &allocator, + SpecificBumpPtrAllocator &allocator, ArrayRef substitutions) { auto *rootNode = allocator.Allocate(); new (rootNode) PredNode; @@ -351,7 +354,7 @@ static std::string getCombinedCondition(const PredNode &root) { } std::string CombinedPred::getConditionImpl() const { - llvm::SpecificBumpPtrAllocator allocator; + SpecificBumpPtrAllocator allocator; auto *predicateTree = buildPredicateTree(*this, allocator, {}); predicateTree = propagateGroundTruth(predicateTree, diff --git a/mlir/lib/TableGen/Type.cpp b/mlir/lib/TableGen/Type.cpp index c3b813ec598d..4f74056947ab 100644 --- a/mlir/lib/TableGen/Type.cpp +++ b/mlir/lib/TableGen/Type.cpp @@ -18,6 +18,7 @@ using namespace mlir; using namespace mlir::tblgen; +using llvm::Record; TypeConstraint::TypeConstraint(const llvm::DefInit *init) : TypeConstraint(init->getDef()) {} @@ -42,7 +43,7 @@ StringRef TypeConstraint::getVariadicOfVariadicSegmentSizeAttr() const { // Returns the builder call for this constraint if this is a buildable type, // returns std::nullopt otherwise. 
std::optional TypeConstraint::getBuilderCall() const { - const llvm::Record *baseType = def; + const Record *baseType = def; if (isVariableLength()) baseType = baseType->getValueAsDef("baseType"); @@ -64,7 +65,7 @@ StringRef TypeConstraint::getCppType() const { return def->getValueAsString("cppType"); } -Type::Type(const llvm::Record *record) : TypeConstraint(record) {} +Type::Type(const Record *record) : TypeConstraint(record) {} Dialect Type::getDialect() const { return Dialect(def->getValueAsDef("dialect")); -- GitLab From 03dcd88c781d06f917750f3a7f6df9ac7f7f67d9 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 18 Oct 2024 22:33:35 +0100 Subject: [PATCH 120/511] [RISCV][ISel] Ensure 'in X' Constraints prevent X0 (#112563) I'm not sure if this fix is required, but I've written the patch anyway. This does not cause test changes, but we haven't got tests that try to use all 32 registers in inline assembly. Broadly, for GPRs, we made the explicit choice that `r` constraints would never attempt to use `x0`, because `x0` isn't really usable like the other GPRs. I believe the same thing applies to `Zhinx`, `Zfinx` and `Zdinx` because they should not be allocating operands to `x0` either, so this patch introduces new `NoX0` classes for `GPRF16` and `GPRF32` registers, and uses them with inline assembly. There is also a `GPRPairNoX0` for the `Zdinx` case on rv32, avoiding use of the `x0` pair which has different behaviour to the other GPR pairs. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +++--- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 60ac58f824ed..fbd2f47d2769 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20385,11 +20385,11 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VT.isVector()) break; if (VT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) - return std::make_pair(0U, &RISCV::GPRF16RegClass); + return std::make_pair(0U, &RISCV::GPRF16NoX0RegClass); if (VT == MVT::f32 && Subtarget.hasStdExtZfinx()) - return std::make_pair(0U, &RISCV::GPRF32RegClass); + return std::make_pair(0U, &RISCV::GPRF32NoX0RegClass); if (VT == MVT::f64 && Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit()) - return std::make_pair(0U, &RISCV::GPRPairRegClass); + return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass); return std::make_pair(0U, &RISCV::GPRNoX0RegClass); case 'f': if (Subtarget.hasStdExtZfhmin() && VT == MVT::f16) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 250f3c10f309..685f04213afa 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -661,6 +661,7 @@ def GPRF16 : RISCVRegisterClass<[f16], 16, (add (sequence "X%u_H", 10, 17), (sequence "X%u_H", 0, 4))>; def GPRF16C : RISCVRegisterClass<[f16], 16, (add (sequence "X%u_H", 10, 15), (sequence "X%u_H", 8, 9))>; +def GPRF16NoX0 : RISCVRegisterClass<[f16], 16, (sub GPRF16, X0_H)>; def GPRF32 : RISCVRegisterClass<[f32], 32, (add (sequence "X%u_W", 10, 17), (sequence "X%u_W", 5, 7), @@ -721,6 +722,8 @@ def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add def GPRPairC : RISCVRegisterClass<[XLenPairFVT], 64, (add X10_X11, X12_X13, X14_X15, X8_X9 )>; + +def GPRPairNoX0 : RISCVRegisterClass<[XLenPairFVT], 64, (sub GPRPair, X0_Pair)>; } // let RegInfos = XLenPairRI, 
DecoderMethod = "DecodeGPRPairRegisterClass" // The register class is added for inline assembly for vector mask types. -- GitLab From e26151913cbfeb52f3e16098707b5e5ddc413b17 Mon Sep 17 00:00:00 2001 From: Nikhil Kalra Date: Fri, 18 Oct 2024 15:22:39 -0700 Subject: [PATCH 121/511] [mlir] Allow CXX standard to be overridden (#112957) MLIR previously hardcoded the CXX version to C++17. Updated to allow for the CXX version to be set by clients (mirrors other LLVM projects). --- mlir/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 599a1cbaafd8..1e80daabddec 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -16,7 +16,7 @@ endif() # Must go below project(..) include(GNUInstallDirs) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to") if(MLIR_STANDALONE_BUILD) find_package(LLVM CONFIG REQUIRED) -- GitLab From c7496cebac047665dbe9460d536c7654bc643a43 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Oct 2024 15:22:59 -0700 Subject: [PATCH 122/511] [LV] Use SCEV to check if minimum iteration check is known. (#111310) Use SCEV to check if the minimum iteration check (TC < Step) is known to be false. This is a first step towards addressing https://github.com/llvm/llvm-project/issues/111098. To catch the exact case from the issue, we need to do extra work to make sure the wrap flags on the shl are preserved and used by SCEV. Note that skeleton creation will be gradually moved to VPlan and this simplification should be done as VPlan transform eventually. The current plan is to move skeleton creation to VPlan starting from parts closest to the parts already created by VPlan, starting with induction resume value creation (started with https://github.com/llvm/llvm-project/pull/110577), then memory and SCEV checks and finally minimum iteration checks. PR: https://github.com/llvm/llvm-project/pull/111310 --- .../Transforms/Vectorize/LoopVectorize.cpp | 38 +++++++++++++--- .../AArch64/eliminate-tail-predication.ll | 3 +- .../LoopVectorize/AArch64/masked-call.ll | 44 ++++--------------- .../AArch64/pr60831-sve-inv-store-crash.ll | 3 +- .../LoopVectorize/AArch64/sve-tail-folding.ll | 3 +- .../AArch64/wider-VF-for-callinst.ll | 5 +-- .../Transforms/LoopVectorize/if-reduction.ll | 4 +- .../version-stride-with-integer-casts.ll | 7 ++- 8 files changed, 49 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e377e1d82037..857efbdf687c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2447,12 +2447,26 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { }; TailFoldingStyle Style = Cost->getTailFoldingStyle(); - if (Style == TailFoldingStyle::None) - CheckMinIters = - Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); - else if (VF.isScalable() && - !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && - Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + if (Style == TailFoldingStyle::None) { + Value *Step = CreateStep(); + ScalarEvolution &SE = *PSE.getSE(); + // TODO: Emit unconditional branch to vector preheader instead of + // conditional branch with known condition. + const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop); + // Check if the trip count is < the step. 
+ if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) { + // TODO: Ensure step is at most the trip count when determining max VF and + // UF, w/o tail folding. + CheckMinIters = Builder.getTrue(); + } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P), + TripCountSCEV, SE.getSCEV(Step))) { + // Generate the minimum iteration check only if we cannot prove the + // check is known to be true, or known to be false. + CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + } // else step known to be < trip count, use CheckMinIters preset to false. + } else if (VF.isScalable() && + !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && + Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an // additional overflow check is required before entering the vector loop. @@ -2462,8 +2476,18 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { ConstantInt::get(CountTy, cast(CountTy)->getMask()); Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); + Value *Step = CreateStep(); +#ifndef NDEBUG + ScalarEvolution &SE = *PSE.getSE(); + const SCEV *TC2OverflowSCEV = SE.applyLoopGuards(SE.getSCEV(LHS), OrigLoop); + assert( + !isIndvarOverflowCheckKnownFalse(Cost, VF * UF) && + !SE.isKnownPredicate(CmpInst::getInversePredicate(ICmpInst::ICMP_ULT), + TC2OverflowSCEV, SE.getSCEV(Step)) && + "unexpectedly proved overflow check to be known"); +#endif // Don't execute the vector loop if (UMax - n) < (VF * UF). - CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); } // Create new preheader for vector loop. 
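The change above reduces the minimum-iteration guard to a three-way decision on `TC < Step`: provably true (the vector loop can never run), provably false (the guard folds to `br i1 false`, falling through to the vector preheader, as the updated tests below show), or unknown (emit the runtime compare as before). A schematic sketch of that decision, with the two booleans standing in for the `SE.isKnownPredicate` queries on the guard-refined trip count:

#include <cassert>

// Three possible lowerings of the guard `trip count < step`.
enum class MinItersCheck {
  AlwaysScalar, // TC < Step proven: branch unconditionally to the scalar loop
  Folded,       // TC >= Step proven: guard becomes `br i1 false`
  Runtime       // neither proven: emit the icmp and a conditional branch
};

static MinItersCheck classify(bool knownLess, bool knownNotLess) {
  assert(!(knownLess && knownNotLess) && "contradictory SCEV facts");
  if (knownLess)
    return MinItersCheck::AlwaysScalar;
  if (knownNotLess)
    return MinItersCheck::Folded;
  return MinItersCheck::Runtime;
}

In the test updates that follow, fixed trip counts such as 1024 or 1025 measured against a `vscale`-scaled step become provable once loop guards are applied, which is why the `icmp ult` and its `MIN_ITERS_CHECK` disappear in the TFNONE check lines.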
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll index 8c50d86489c9..7dcab6d807cf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll @@ -11,8 +11,7 @@ define void @f1(ptr %A) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 93034f4dbe56..5496eed16e54 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -11,10 +11,7 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_widen( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -146,10 +143,7 @@ for.cond.cleanup: define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_if_then( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -310,10 +304,7 @@ for.cond.cleanup: define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_widen_if_then_else( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -490,10 +481,7 @@ for.cond.cleanup: define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-LABEL: @test_widen_nomask( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TFNONE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; TFNONE:       vector.ph:
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -548,11 +536,6 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ;
 ; TFFALLBACK-LABEL: @test_widen_nomask(
 ; TFFALLBACK-NEXT:  entry:
-; TFFALLBACK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; TFFALLBACK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; TFFALLBACK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; TFFALLBACK:       vector.ph:
 ; TFFALLBACK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; TFFALLBACK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFFALLBACK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
@@ -561,7 +544,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; TFFALLBACK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; TFFALLBACK:       vector.body:
-; TFFALLBACK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TFFALLBACK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFFALLBACK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFFALLBACK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
 ; TFFALLBACK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
 ; TFFALLBACK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFFALLBACK-NEXT:    store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
 ; TFFALLBACK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; TFFALLBACK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TFFALLBACK-NEXT:    br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; TFFALLBACK:       scalar.ph:
-; TFFALLBACK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[VECTOR_BODY]] ]
-; TFFALLBACK-NEXT:    br label [[FOR_BODY:%.*]]
+; TFFALLBACK-NEXT:    br i1 [[TMP9]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; TFFALLBACK:       for.body:
-; TFFALLBACK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; TFFALLBACK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[VECTOR_BODY]] ]
 ; TFFALLBACK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
 ; TFFALLBACK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; TFFALLBACK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
@@ -626,10 +606,7 @@
 define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-LABEL: @test_widen_optmask(
 ; TFNONE-NEXT:  entry:
-; TFNONE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; TFNONE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TFNONE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; TFNONE:       vector.ph:
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -791,10 +768,7 @@ for.cond.cleanup:
 define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, double %m) #4 {
 ; TFNONE-LABEL: 
@test_widen_fmuladd_and_call( ; TFNONE-NEXT: entry: -; TFNONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFNONE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFNONE: vector.ph: ; TFNONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll index 0e95d742092e..d18cdc1ae617 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll @@ -10,8 +10,7 @@ define void @test_invar_gep(ptr %dst) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 94b90aa3cfb3..1d150141e625 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -757,8 +757,7 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll index 4a2f9d07ed91..4a3bc4679bba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll @@ -7,10 +7,7 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_widen(ptr noalias %a, ptr readnone %b) #1 { ; WIDE-LABEL: @test_widen( ; WIDE-NEXT: entry: -; WIDE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; WIDE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; WIDE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; WIDE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; WIDE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; WIDE: vector.ph: ; WIDE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; WIDE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index 383b62b368ef..330cdeaeb7c2 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ 
b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
@@ -1659,6 +1659,7 @@ for.end:                                          ; preds = %for.body, %entry
   ret i64 %1
 }
 
+; FIXME: %indvars.iv.next is poison on first iteration due to sub nuw 0, 1.
 define i32 @fcmp_0_sub_select1(ptr noalias %x, i32 %N) nounwind readonly {
 ; CHECK-LABEL: define i32 @fcmp_0_sub_select1(
 ; CHECK-SAME: ptr noalias [[X:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -1668,8 +1669,7 @@ define i32 @fcmp_0_sub_select1(ptr noalias %x, i32 %N) nounwind readonly {
 ; CHECK:       [[FOR_HEADER]]:
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 0, [[ZEXT]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index b3ec3e8f0f3c..5e65832aba8c 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -415,6 +415,7 @@ exit:
 
 ; Test case to make sure that uses of versioned strides of type i1 are properly
 ; extended. From https://github.com/llvm/llvm-project/issues/91369.
+; TODO: Better check (udiv i64 15, %g.64) after checking if %g == 1.
 define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-LABEL: define void @zext_of_i1_stride(
 ; CHECK-SAME: i1 [[G:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -423,8 +424,7 @@ define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    [[G_64:%.*]] = zext i1 [[G]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[G_64]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true
 ; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
@@ -489,8 +489,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
 ; CHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[TMP0]], [[G_64]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true
 ; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-- 
GitLab


From f4c6088346fa284412f13a24116836ff64b6bd4b Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Fri, 18 Oct 2024 15:23:52 -0700
Subject: [PATCH 123/511] [lsan] Process non-suspended threads (#112807)

For such threads we have no registers, so no exact stack range, and no
guarantee that the stack is mapped at all. To avoid crashes on unmapped
memory, `MemCpyAccessible` copies the interesting range into a temporary
buffer, and we search for pointers there.
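The idea, as a standalone sketch (simplified: plain C++ types instead of
the sanitizer-common interfaces, and a plain memcpy where the real code
uses the page-probing MemCpyAccessible):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Copy a possibly-unmapped [begin, end) range into a local buffer once;
    // afterwards, every pointer-sized read during the scan is served from
    // the copy, so the scan itself can never fault.
    struct CopyAccessor {
      uintptr_t begin = 0;
      std::vector<char> buf;

      void init(uintptr_t b, uintptr_t e) {
        begin = b;
        buf.resize(e - b);
        // The real implementation calls MemCpyAccessible() here, which
        // probes accessibility page-by-page instead of assuming the whole
        // range is safe to copy.
        std::memcpy(buf.data(), reinterpret_cast<const void *>(b), buf.size());
      }

      void *load_ptr(uintptr_t p) const {
        void *v;
        std::memcpy(&v, buf.data() + (p - begin), sizeof(v));
        return v;
      }
    };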
---
 compiler-rt/lib/lsan/lsan_common.cpp | 41 ++++++++++++++++++++++++++++
 compiler-rt/lib/lsan/lsan_flags.inc  |  2 ++
 2 files changed, 43 insertions(+)

diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp
index 9aed36b96ce9..bcb7baa6c530 100644
--- a/compiler-rt/lib/lsan/lsan_common.cpp
+++ b/compiler-rt/lib/lsan/lsan_common.cpp
@@ -293,6 +293,27 @@ struct DirectMemoryAccessor {
   void Init(uptr begin, uptr end) {};
   void *LoadPtr(uptr p) const { return *reinterpret_cast<void **>(p); }
 };
+
+struct CopyMemoryAccessor {
+  void Init(uptr begin, uptr end) {
+    this->begin = begin;
+    buffer.clear();
+    buffer.resize(end - begin);
+    MemCpyAccessible(buffer.data(), reinterpret_cast<void *>(begin),
+                     buffer.size());
+  };
+
+  void *LoadPtr(uptr p) const {
+    uptr offset = p - begin;
+    CHECK_LE(offset + sizeof(void *), reinterpret_cast<uptr>(buffer.size()));
+    return *reinterpret_cast<void **>(offset +
+                                      reinterpret_cast<uptr>(buffer.data()));
+  }
+
+ private:
+  uptr begin;
+  InternalMmapVector<char> buffer;
+};
 }  // namespace
 
 // Scans the memory range, looking for byte patterns that point into allocator
@@ -535,6 +556,7 @@ static void ProcessThread(tid_t os_id, uptr sp,
 static void ProcessThreads(SuspendedThreadsList const &suspended_threads,
                            Frontier *frontier, tid_t caller_tid,
                            uptr caller_sp) {
+  InternalMmapVector<tid_t> done_threads;
   InternalMmapVector<uptr> registers;
   InternalMmapVector<Range> extra_ranges;
   for (uptr i = 0; i < suspended_threads.ThreadCount(); i++) {
@@ -559,6 +581,25 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads,
 
     DirectMemoryAccessor accessor;
     ProcessThread(os_id, sp, registers, extra_ranges, frontier, accessor);
+    if (flags()->use_detached)
+      done_threads.push_back(os_id);
+  }
+
+  if (flags()->use_detached) {
+    CopyMemoryAccessor accessor;
+    InternalMmapVector<tid_t> known_threads;
+    GetRunningThreadsLocked(&known_threads);
+    Sort(done_threads.data(), done_threads.size());
+    for (tid_t os_id : known_threads) {
+      registers.clear();
+      extra_ranges.clear();
+
+      uptr i = InternalLowerBound(done_threads, os_id);
+      if (i >= done_threads.size() || done_threads[i] != os_id) {
+        uptr sp = (os_id == caller_tid) ? caller_sp : 0;
+        ProcessThread(os_id, sp, registers, extra_ranges, frontier, accessor);
+      }
+    }
   }
 
   // Add pointers reachable from ThreadContexts
diff --git a/compiler-rt/lib/lsan/lsan_flags.inc b/compiler-rt/lib/lsan/lsan_flags.inc
index c97b021ba5c0..e0b4aa4a3299 100644
--- a/compiler-rt/lib/lsan/lsan_flags.inc
+++ b/compiler-rt/lib/lsan/lsan_flags.inc
@@ -41,6 +41,8 @@ LSAN_FLAG(bool, use_ld_allocations, true,
 LSAN_FLAG(bool, use_unaligned, false, "Consider unaligned pointers valid.")
 LSAN_FLAG(bool, use_poisoned, false,
           "Consider pointers found in poisoned memory to be valid.")
+LSAN_FLAG(bool, use_detached, false,
+          "Scan threads even if attaching to them failed.")
 LSAN_FLAG(bool, log_pointers, false, "Debug logging")
 LSAN_FLAG(bool, log_threads, false, "Debug logging")
 LSAN_FLAG(int, tries, 1, "Debug option to repeat leak checking multiple times")
-- 
GitLab


From 709116cb76803bdb897d191ef2d96ae19846ed81 Mon Sep 17 00:00:00 2001
From: Jon Roelofs
Date: Fri, 18 Oct 2024 15:30:57 -0700
Subject: [PATCH 124/511] [llvm-c][MC] Expose color printing via
 LLVMSetDisasmOptions (#112980)

---
 llvm/include/llvm-c/Disassembler.h          |  4 +++-
 llvm/lib/MC/MCDisassembler/Disassembler.cpp | 11 +++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm-c/Disassembler.h b/llvm/include/llvm-c/Disassembler.h
index b1cb35da6687..4bc6b04dd6ea 100644
--- a/llvm/include/llvm-c/Disassembler.h
+++ b/llvm/include/llvm-c/Disassembler.h
@@ -79,8 +79,10 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DC, uint64_t Options);
 #define LLVMDisassembler_Option_AsmPrinterVariant 4
 /* The option to set comment on instructions */
 #define LLVMDisassembler_Option_SetInstrComments 8
-  /* The option to print latency information alongside instructions */
+/* The option to print latency information alongside instructions */
 #define LLVMDisassembler_Option_PrintLatency 16
+/* The option to print in color */
+#define LLVMDisassembler_Option_Color 32
 
 /**
  * Dispose of a disassembler context.
diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
index 5e5a163c2902..f5d6c6bb5618 100644
--- a/llvm/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -277,6 +277,12 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
     SmallVector<char, 64> InsnStr;
     raw_svector_ostream OS(InsnStr);
     formatted_raw_ostream FormattedOS(OS);
+
+    if (DC->getOptions() & LLVMDisassembler_Option_Color) {
+      FormattedOS.enable_colors(true);
+      IP->setUseColor(true);
+    }
+
     IP->printInst(&Inst, PC, AnnotationsStr, *DC->getSubtargetInfo(),
                   FormattedOS);
 
@@ -343,5 +349,10 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
       DC->addOptions(LLVMDisassembler_Option_PrintLatency);
       Options &= ~LLVMDisassembler_Option_PrintLatency;
   }
+  if (Options & LLVMDisassembler_Option_Color) {
+    LLVMDisasmContext *DC = static_cast<LLVMDisasmContext *>(DCR);
+    DC->addOptions(LLVMDisassembler_Option_Color);
+    Options &= ~LLVMDisassembler_Option_Color;
+  }
   return (Options == 0);
 }
-- 
GitLab


From 416731bf7fe4c44437765a467267a9cdff75bfcf Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Fri, 18 Oct 2024 15:32:23 -0700
Subject: [PATCH 125/511] [NvlinkWrapper] Use `-plugin-opt=mattr=` instead of a
 custom feature (#111712)

Summary:
We don't need a custom flag for this; LLVM already has a way to get the
target features, and they are forwarded via `-plugin-opt`.
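For example, the cuda-cross-compiling.c driver test expectation changes
from

    // FEATURE: clang-nvlink-wrapper{{.*}}"--feature" "+ptx63"

to

    // FEATURE: clang-nvlink-wrapper{{.*}}"--plugin-opt=mattr=+ptx63"

so the wrapper now picks the features up through the standard codegen
option handling (llvm::codegen::getMAttrs()) instead of a bespoke
`--feature` flag.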
---
 clang/lib/Driver/ToolChains/Cuda.cpp                    | 4 ++--
 clang/test/Driver/cuda-cross-compiling.c                | 2 +-
 clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp | 2 +-
 clang/tools/clang-nvlink-wrapper/NVLinkOpts.td          | 3 ---
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index b368c4737567..e9d2e3fe6d5c 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -634,8 +634,8 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   std::vector<StringRef> Features;
   getNVPTXTargetFeatures(C.getDriver(), getToolChain().getTriple(), Args,
                          Features);
-  for (StringRef Feature : Features)
-    CmdArgs.append({"--feature", Args.MakeArgString(Feature)});
+  CmdArgs.push_back(
+      Args.MakeArgString("--plugin-opt=mattr=" + llvm::join(Features, ",")));
 
   // Add paths for the default clang library path.
   SmallString<256> DefaultLibPath =
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index 5f24e7a5accb..54c291fac66f 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -104,4 +104,4 @@
 // RUN: %clang -target nvptx64-nvidia-cuda --cuda-feature=+ptx63 -march=sm_52 -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=FEATURE %s
 
-// FEATURE: clang-nvlink-wrapper{{.*}}"--feature" "+ptx63"
+// FEATURE: clang-nvlink-wrapper{{.*}}"--plugin-opt=mattr=+ptx63"
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index b4b376fe0d11..b9767a7a03d0 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -344,7 +344,7 @@ Expected<std::unique_ptr<lto::LTO>> createLTO(const ArgList &Args) {
   Conf.RemarksHotnessThreshold = RemarksHotnessThreshold;
   Conf.RemarksFormat = RemarksFormat;
 
-  Conf.MAttrs = {Args.getLastArgValue(OPT_feature, "").str()};
+  Conf.MAttrs = llvm::codegen::getMAttrs();
 
   std::optional<CodeGenOptLevel> CGOptLevelOrNone =
       CodeGenOpt::parseLevel(Args.getLastArgValue(OPT_O, "2")[0]);
   assert(CGOptLevelOrNone && "Invalid optimization level");
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
index eeb9d1a62282..a80c5937b429 100644
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -47,9 +47,6 @@ def arch : Separate<["--", "-"], "arch">,
 def : Joined<["--", "-"], "plugin-opt=mcpu=">,
   Flags<[HelpHidden, WrapperOnlyOption]>, Alias<arch>;
 
-def feature : Separate<["--", "-"], "feature">, Flags<[WrapperOnlyOption]>,
-  HelpText<"Specify the '+ptx' freature to use for LTO.">;
-
 def g : Flag<["-"], "g">, HelpText<"Specify that this was a debug compile.">;
 def debug : Flag<["--"], "debug">, Alias<g>;
-- 
GitLab


From b35b5838094cdae897519a5f404a18e613041cff Mon Sep 17 00:00:00 2001
From: alx32 <103613512+alx32@users.noreply.github.com>
Date: Fri, 18 Oct 2024 15:45:00 -0700
Subject: [PATCH 126/511] [lld-macho] Fix category merging sed issue - Try nr.2
 (#112981)

We replace sed with awk, as I couldn't find a sed syntax that works
consistently on both Linux and macOS. Reproduced the original issue on
macOS and confirmed the test now passes on both macOS and Linux.
---
 lld/test/MachO/objc-category-merging-minimal.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/test/MachO/objc-category-merging-minimal.s b/lld/test/MachO/objc-category-merging-minimal.s
index 437294791bf3..88c175333f26 100644
--- a/lld/test/MachO/objc-category-merging-minimal.s
+++ b/lld/test/MachO/objc-category-merging-minimal.s
@@ -30,7 +30,7 @@
 ############ Test merging skipped due to invalid category name ############
 # Modify __OBJC_$_CATEGORY_MyBaseClass_$_Category01's name to point to L_OBJC_IMAGE_INFO+3
-# RUN: sed -E '/^__OBJC_\$_CATEGORY_MyBaseClass_\$_Category01:/ { n; s/^[ \t]*\.quad[ \t]+l_OBJC_CLASS_NAME_$/\t.quad\tL_OBJC_IMAGE_INFO+3/}' merge_cat_minimal.s > merge_cat_minimal_bad_name.s
+# RUN: awk '/^__OBJC_\$_CATEGORY_MyBaseClass_\$_Category01:/ { print; getline; sub(/^[ \t]*\.quad[ \t]+l_OBJC_CLASS_NAME_$/, "\t.quad\tL_OBJC_IMAGE_INFO+3"); print; next } { print }' merge_cat_minimal.s > merge_cat_minimal_bad_name.s
 
 # Assemble the modified source
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_cat_minimal_bad_name.o merge_cat_minimal_bad_name.s
-- 
GitLab


From 0afe6e42fbab25b3b0d35921774bf2584bcd0d74 Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Fri, 18 Oct 2024 15:48:15 -0700
Subject: [PATCH 127/511] [libc] Scanf shouldn't match just "0x" for hex int
 (#112440)

Scanf parsing reads the longest possibly valid prefix for a given
conversion, then performs the conversion on that string. In the case of
"0xZ" with a hex conversion (either "%x" or "%i"), the longest possibly
valid prefix is "0x", which makes it the "input item" (per the
standard). The sequence "0x" is not a "matching sequence" for a hex
conversion, so it causes a matching failure and parsing ends; e.g.
sscanf("0xZ", "%x", &v) now returns 0 without writing to v. The parser
cannot recover because, to know that there is no valid digit after
"0x", it must read the 'Z', but it can only put back one character (the
'Z'), leaving it having consumed an invalid sequence.

(inspired by a thread on the libc-coord mailing list:
https://www.openwall.com/lists/libc-coord/2024/10/15/1, see 7.32.6.2 in
the standard for more details.)
---
 libc/src/stdio/scanf_core/int_converter.cpp | 20 +++++++++---
 libc/test/src/stdio/sscanf_test.cpp         | 36 +++++++++++++++------
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/libc/src/stdio/scanf_core/int_converter.cpp b/libc/src/stdio/scanf_core/int_converter.cpp
index 136db2a3773e..ecdac52e84bb 100644
--- a/libc/src/stdio/scanf_core/int_converter.cpp
+++ b/libc/src/stdio/scanf_core/int_converter.cpp
@@ -124,13 +124,24 @@ int convert_int(Reader *reader, const FormatSection &to_conv) {
 
     if (to_lower(cur_char) == 'x') {
       // This is a valid hex prefix.
+
+      is_number = false;
+      // A valid hex prefix is not necessarily a valid number. For the
+      // conversion to be valid it needs to use all of the characters it
+      // consumes. From the standard:
+      // 7.23.6.2 paragraph 9: "An input item is defined as the longest
+      // sequence of input characters which does not exceed any specified
+      // field width and which is, or is a prefix of, a matching input
+      // sequence."
+      // 7.23.6.2 paragraph 10: "If the input item is not a matching sequence,
+      // the execution of the directive fails: this condition is a matching
+      // failure"
       base = 16;
 
       if (max_width > 1) {
        --max_width;
        cur_char = reader->getc();
      } else {
-        write_int_with_length(0, to_conv);
-        return READ_OK;
+        return MATCHING_FAILURE;
      }
 
     } else {
@@ -198,6 +209,9 @@ int convert_int(Reader *reader, const FormatSection &to_conv) {
   // last one back.
reader->ungetc(cur_char); + if (!is_number) + return MATCHING_FAILURE; + if (has_overflow) { write_int_with_length(MAX, to_conv); } else { @@ -207,8 +221,6 @@ int convert_int(Reader *reader, const FormatSection &to_conv) { write_int_with_length(result, to_conv); } - if (!is_number) - return MATCHING_FAILURE; return READ_OK; } diff --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp index 33bb0acba3e6..18addb632067 100644 --- a/libc/test/src/stdio/sscanf_test.cpp +++ b/libc/test/src/stdio/sscanf_test.cpp @@ -177,13 +177,25 @@ TEST(LlvmLibcSScanfTest, IntConvMaxLengthTests) { EXPECT_EQ(ret_val, 1); EXPECT_EQ(result, 0); + result = -999; + + // 0x is a valid prefix, but not a valid number. This should be a matching + // failure and should not modify the values. ret_val = LIBC_NAMESPACE::sscanf("0x1", "%2i", &result); - EXPECT_EQ(ret_val, 1); - EXPECT_EQ(result, 0); + EXPECT_EQ(ret_val, 0); + EXPECT_EQ(result, -999); ret_val = LIBC_NAMESPACE::sscanf("-0x1", "%3i", &result); + EXPECT_EQ(ret_val, 0); + EXPECT_EQ(result, -999); + + ret_val = LIBC_NAMESPACE::sscanf("0x1", "%3i", &result); EXPECT_EQ(ret_val, 1); - EXPECT_EQ(result, 0); + EXPECT_EQ(result, 1); + + ret_val = LIBC_NAMESPACE::sscanf("-0x1", "%4i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, -1); ret_val = LIBC_NAMESPACE::sscanf("-0x123", "%4i", &result); EXPECT_EQ(ret_val, 1); @@ -212,7 +224,7 @@ TEST(LlvmLibcSScanfTest, IntConvNoWriteTests) { EXPECT_EQ(result, 0); ret_val = LIBC_NAMESPACE::sscanf("0x1", "%*2i", &result); - EXPECT_EQ(ret_val, 1); + EXPECT_EQ(ret_val, 0); EXPECT_EQ(result, 0); ret_val = LIBC_NAMESPACE::sscanf("a", "%*i", &result); @@ -679,13 +691,17 @@ TEST(LlvmLibcSScanfTest, CombinedConv) { EXPECT_EQ(result, 123); ASSERT_STREQ(buffer, "abc"); + result = -1; + + // 0x is a valid prefix, but not a valid number. This should be a matching + // failure and should not modify the values. ret_val = LIBC_NAMESPACE::sscanf("0xZZZ", "%i%s", &result, buffer); - EXPECT_EQ(ret_val, 2); - EXPECT_EQ(result, 0); - ASSERT_STREQ(buffer, "ZZZ"); + EXPECT_EQ(ret_val, 0); + EXPECT_EQ(result, -1); + ASSERT_STREQ(buffer, "abc"); ret_val = LIBC_NAMESPACE::sscanf("0xZZZ", "%X%s", &result, buffer); - EXPECT_EQ(ret_val, 2); - EXPECT_EQ(result, 0); - ASSERT_STREQ(buffer, "ZZZ"); + EXPECT_EQ(ret_val, 0); + EXPECT_EQ(result, -1); + ASSERT_STREQ(buffer, "abc"); } -- GitLab From 6d347fdfbd018b6555a754219fda461e166f2a64 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Sat, 19 Oct 2024 01:10:32 +0200 Subject: [PATCH 128/511] [libc][math][c23] Add log2f16 C23 math function (#106084) Part of #95250. 
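For reference, log2f16 uses the same range reduction as the existing
logf16 (sketched here with illustrative numbers; the exact tables and
polynomial are in the new source file below). Writing x = 2^m * 1.mant:

    log2(x) = m + log2(1.mant)
            = m + log2(f) + log2(1 + d/f),

where f keeps the top bits of 1.mant and d = 1.mant - f. For example,
x = 10 gives m = 3 and 1.mant = 1.25, so log2(10) = 3 + log2(1.25)
~= 3.3219.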
---
 libc/config/gpu/entrypoints.txt           |   1 +
 libc/config/linux/x86_64/entrypoints.txt  |   1 +
 libc/docs/math/index.rst                  |   2 +-
 libc/spec/stdc.td                         |   1 +
 libc/src/math/CMakeLists.txt              |   1 +
 libc/src/math/generic/CMakeLists.txt      |  22 ++++
 libc/src/math/generic/expxf16.h           |  14 ++
 libc/src/math/generic/log2f16.cpp         | 149 ++++++++++++++++++++++
 libc/src/math/generic/logf16.cpp          |   2 +-
 libc/src/math/log2f16.h                   |  21 +++
 libc/test/src/math/CMakeLists.txt         |  11 ++
 libc/test/src/math/log2f16_test.cpp       |  40 ++++++
 libc/test/src/math/smoke/CMakeLists.txt   |  13 ++
 libc/test/src/math/smoke/log2f16_test.cpp |  50 ++++++++
 14 files changed, 326 insertions(+), 2 deletions(-)
 create mode 100644 libc/src/math/generic/log2f16.cpp
 create mode 100644 libc/src/math/log2f16.h
 create mode 100644 libc/test/src/math/log2f16_test.cpp
 create mode 100644 libc/test/src/math/smoke/log2f16_test.cpp

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index d9df737efea3..2cc54e8a4b97 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -567,6 +567,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.llogbf16
     libc.src.math.llrintf16
     libc.src.math.llroundf16
+    libc.src.math.log2f16
     libc.src.math.logbf16
     libc.src.math.logf16
     libc.src.math.lrintf16
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 3d9d5a9e984c..06ea7bba81f3 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -660,6 +660,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.llogbf16
     libc.src.math.llrintf16
     libc.src.math.llroundf16
+    libc.src.math.log2f16
     libc.src.math.logbf16
     libc.src.math.logf16
     libc.src.math.lrintf16
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index a4c59190a01b..6591cbbdc155 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -318,7 +318,7 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | log1p     | |check|          | |check|         |                        |                      |                        | 7.12.6.14              | F.10.3.14                  |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| log2      | |check|          | |check|         |                        |                      |                        | 7.12.6.15              | F.10.3.15                  |
+| log2      | |check|          | |check|         |                        | |check|              |                        | 7.12.6.15              | F.10.3.15                  |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | log2p1    |                  |                 |                        |                      |                        | 7.12.6.16              | F.10.3.16                  |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index cc49835ac7e1..d2a073847503 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -648,6 +648,7 @@ def StdC : StandardSpec<"stdc"> {
 
           FunctionSpec<"log2", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"log2f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+          GuardedFunctionSpec<"log2f16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
 
           FunctionSpec<"log", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"logf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 9c6646cd658e..516bed499b19 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -340,6 +340,7 @@ add_math_entrypoint_object(log1pf)
 add_math_entrypoint_object(log2)
 add_math_entrypoint_object(log2f)
+add_math_entrypoint_object(log2f16)
 
 add_math_entrypoint_object(log)
 add_math_entrypoint_object(logf)
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index b95672bc3968..d7c7a3431d3d 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -2250,6 +2250,28 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  log2f16
+  SRCS
+    log2f16.cpp
+  HDRS
+    ../log2f16.h
+  DEPENDS
+    .expxf16
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.cpu_features
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   log
   SRCS
diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h
index 357421958b4d..56ed6ee7cc00 100644
--- a/libc/src/math/generic/expxf16.h
+++ b/libc/src/math/generic/expxf16.h
@@ -302,6 +302,20 @@ constexpr cpp::array<float, 32> LOGF_F = {
     0x1.41d8fep-1f, 0x1.4a4f86p-1f, 0x1.52a2d2p-1f, 0x1.5ad404p-1f,
 };
 
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > for i from 0 to 31 do print(round(log2(1 + i * 2^-5), SG, RN));
+constexpr cpp::array<float, 32> LOG2F_F = {
+    0x0p+0f,        0x1.6bad38p-5f, 0x1.663f7p-4f,  0x1.08c588p-3f,
+    0x1.5c01a4p-3f, 0x1.acf5e2p-3f, 0x1.fbc16cp-3f, 0x1.24407ap-2f,
+    0x1.49a784p-2f, 0x1.6e221cp-2f, 0x1.91bba8p-2f, 0x1.b47ecp-2f,
+    0x1.d6753ep-2f, 0x1.f7a856p-2f, 0x1.0c105p-1f,  0x1.1bf312p-1f,
+    0x1.2b8034p-1f, 0x1.3abb4p-1f,  0x1.49a784p-1f, 0x1.584822p-1f,
+    0x1.66a008p-1f, 0x1.74b1fep-1f, 0x1.82809ep-1f, 0x1.900e62p-1f,
+    0x1.9d5dap-1f,  0x1.aa709p-1f,  0x1.b74948p-1f, 0x1.c3e9cap-1f,
+    0x1.d053f6p-1f, 0x1.dc899ap-1f, 0x1.e88c6cp-1f, 0x1.f45e08p-1f,
+};
+
 // Generated by Sollya with the following commands:
 // > display = hexadecimal;
 // > for i from 0 to 31 do print(round(1 / (1 + i * 2^-5), SG, RN));
diff --git a/libc/src/math/generic/log2f16.cpp b/libc/src/math/generic/log2f16.cpp
new file mode 100644
index 000000000000..ff4e0268b53d
--- /dev/null
+++ b/libc/src/math/generic/log2f16.cpp
@@ -0,0 +1,149 @@
+//===-- Half-precision log2(x) function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log2f16.h"
+#include "expxf16.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/cpu_features.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+static constexpr size_t N_LOG2F16_EXCEPTS = 2;
+#else
+static constexpr size_t N_LOG2F16_EXCEPTS = 9;
+#endif
+
+static constexpr fputil::ExceptValues<float16, N_LOG2F16_EXCEPTS>
+    LOG2F16_EXCEPTS = {{
+// (input, RZ output, RU offset, RD offset, RN offset)
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+        // x = 0x1.224p-1, log2f16(x) = -0x1.a34p-1 (RZ)
+        {0x3889U, 0xba8dU, 0U, 1U, 0U},
+        // x = 0x1.e34p-1, log2f16(x) = -0x1.558p-4 (RZ)
+        {0x3b8dU, 0xad56U, 0U, 1U, 0U},
+#endif
+        // x = 0x1.e8cp-1, log2f16(x) = -0x1.128p-4 (RZ)
+        {0x3ba3U, 0xac4aU, 0U, 1U, 0U},
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+        // x = 0x1.f98p-1, log2f16(x) = -0x1.2ep-6 (RZ)
+        {0x3be6U, 0xa4b8U, 0U, 1U, 0U},
+        // x = 0x1.facp-1, log2f16(x) = -0x1.e7p-7 (RZ)
+        {0x3bebU, 0xa39cU, 0U, 1U, 1U},
+#endif
+        // x = 0x1.fb4p-1, log2f16(x) = -0x1.b88p-7 (RZ)
+        {0x3bedU, 0xa2e2U, 0U, 1U, 1U},
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+        // x = 0x1.fecp-1, log2f16(x) = -0x1.cep-9 (RZ)
+        {0x3bfbU, 0x9b38U, 0U, 1U, 1U},
+        // x = 0x1.ffcp-1, log2f16(x) = -0x1.714p-11 (RZ)
+        {0x3bffU, 0x91c5U, 0U, 1U, 1U},
+        // x = 0x1.224p+0, log2f16(x) = 0x1.72cp-3 (RZ)
+        {0x3c89U, 0x31cbU, 1U, 0U, 1U},
+#endif
+    }};
+
+LLVM_LIBC_FUNCTION(float16, log2f16, (float16 x)) {
+  using FPBits = fputil::FPBits<float16>;
+  FPBits x_bits(x);
+
+  uint16_t x_u = x_bits.uintval();
+
+  // If x <= 0, or x is 1, or x is +inf, or x is NaN.
+  if (LIBC_UNLIKELY(x_u == 0U || x_u == 0x3c00U || x_u >= 0x7c00U)) {
+    // log2(NaN) = NaN
+    if (x_bits.is_nan()) {
+      if (x_bits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+
+      return x;
+    }
+
+    // log2(+/-0) = −inf
+    if ((x_u & 0x7fffU) == 0U) {
+      fputil::raise_except_if_required(FE_DIVBYZERO);
+      return FPBits::inf(Sign::NEG).get_val();
+    }
+
+    if (x_u == 0x3c00U)
+      return FPBits::zero().get_val();
+
+    // When x < 0.
+    if (x_u > 0x8000U) {
+      fputil::set_errno_if_required(EDOM);
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+
+    // log2(+inf) = +inf
+    return FPBits::inf().get_val();
+  }
+
+  if (auto r = LOG2F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+
+  // To compute log2(x), we perform the following range reduction:
+  //   x = 2^m * 1.mant,
+  //   log2(x) = m + log2(1.mant).
+  // To compute log2(1.mant), let f be the highest 6 bits including the hidden
+  // bit, and d be the difference (1.mant - f), i.e., the remaining 5 bits of
+  // the mantissa, then:
+  //   log2(1.mant) = log2(f) + log2(1.mant / f)
+  //                = log2(f) + log2(1 + d/f)
+  // since d/f is sufficiently small.
+  // We store log2(f) and 1/f in the lookup tables LOG2F_F and ONE_OVER_F_F
+  // respectively.
+
+  int m = -FPBits::EXP_BIAS;
+
+  // When x is subnormal, normalize it.
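+  // (Multiplying by 2^FRACTION_LEN below is exact, so it only scales x into
+  // the normal range; the later `m -= FPBits::FRACTION_LEN` undoes the
+  // scaling in the final result.)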
+  if ((x_u & FPBits::EXP_MASK) == 0U) {
+    // Can't pass an integer to fputil::cast directly.
+    constexpr float NORMALIZE_EXP = 1U << FPBits::FRACTION_LEN;
+    x_bits = FPBits(x_bits.get_val() * fputil::cast<float16>(NORMALIZE_EXP));
+    x_u = x_bits.uintval();
+    m -= FPBits::FRACTION_LEN;
+  }
+
+  uint16_t mant = x_bits.get_mantissa();
+  // Leading 10 - 5 = 5 bits of the mantissa.
+  int f = mant >> 5;
+  // Unbiased exponent.
+  m += x_u >> FPBits::FRACTION_LEN;
+
+  // Set bits to 1.mant instead of 2^m * 1.mant.
+  x_bits.set_biased_exponent(FPBits::EXP_BIAS);
+  float mant_f = x_bits.get_val();
+  // v = 1.mant * 1/f - 1 = d/f
+  float v = fputil::multiply_add(mant_f, ONE_OVER_F_F[f], -1.0f);
+
+  // Degree-3 minimax polynomial generated by Sollya with the following
+  // commands:
+  //   > display = hexadecimal;
+  //   > P = fpminimax(log2(1 + x)/x, 2, [|SG...|], [-2^-5, 2^-5]);
+  //   > x * P;
+  float log2p1_d_over_f =
+      v * fputil::polyeval(v, 0x1.715476p+0f, -0x1.71771ap-1f, 0x1.ecb38ep-2f);
+  // log2(1.mant) = log2(f) + log2(1 + d/f)
+  float log2_1_mant = LOG2F_F[f] + log2p1_d_over_f;
+
+  return fputil::cast<float16>(static_cast<float>(m) + log2_1_mant);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/logf16.cpp b/libc/src/math/generic/logf16.cpp
index 735fec9681dd..802225a81055 100644
--- a/libc/src/math/generic/logf16.cpp
+++ b/libc/src/math/generic/logf16.cpp
@@ -115,7 +115,7 @@ LLVM_LIBC_FUNCTION(float16, logf16, (float16 x)) {
   //   log(1.mant) = log(f) + log(1.mant / f)
   //               = log(f) + log(1 + d/f)
   // since d/f is sufficiently small.
-  // We store log(f) and 1/f in the lookup tables LOGF_F and ONE_OVER_F
+  // We store log(f) and 1/f in the lookup tables LOGF_F and ONE_OVER_F_F
   // respectively.
 
   int m = -FPBits::EXP_BIAS;
diff --git a/libc/src/math/log2f16.h b/libc/src/math/log2f16.h
new file mode 100644
index 000000000000..d89f9f398e2a
--- /dev/null
+++ b/libc/src/math/log2f16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for log2f16 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_LOG2F16_H
+#define LLVM_LIBC_SRC_MATH_LOG2F16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 log2f16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_LOG2F16_H
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 2d935f588488..24a5abec898a 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1809,6 +1809,17 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  log2f16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    log2f16_test.cpp
+  DEPENDS
+    libc.src.math.log2f16
+)
+
 add_fp_unittest(
   log10_test
   NEED_MPFR
diff --git a/libc/test/src/math/log2f16_test.cpp b/libc/test/src/math/log2f16_test.cpp
new file mode 100644
index 000000000000..6630ca877d8d
--- /dev/null
+++ b/libc/test/src/math/log2f16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for log2f16 ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log2f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcLog2f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcLog2f16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x,
+                                   LIBC_NAMESPACE::log2f16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcLog2f16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x,
+                                   LIBC_NAMESPACE::log2f16(x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index a3cd671269ca..3c077240356b 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3595,6 +3595,19 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  log2f16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    log2f16_test.cpp
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.errno.errno
+    libc.src.math.log2f16
+    libc.src.__support.FPUtil.cast
+)
+
 add_fp_unittest(
   log10_test
   SUITE
diff --git a/libc/test/src/math/smoke/log2f16_test.cpp b/libc/test/src/math/smoke/log2f16_test.cpp
new file mode 100644
index 000000000000..6d98482aa449
--- /dev/null
+++ b/libc/test/src/math/smoke/log2f16_test.cpp
@@ -0,0 +1,50 @@
+//===-- Unittests for log2f16 ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/log2f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcLog2f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcLog2f16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log2f16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log2f16(sNaN), FE_INVALID);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::log2f16(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log2f16(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(
+      neg_inf, LIBC_NAMESPACE::log2f16(zero), FE_DIVBYZERO);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(
+      neg_inf, LIBC_NAMESPACE::log2f16(neg_zero), FE_DIVBYZERO);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(
+      zero,
+      LIBC_NAMESPACE::log2f16(LIBC_NAMESPACE::fputil::cast<float16>(1.0)));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(
+      aNaN,
+      LIBC_NAMESPACE::log2f16(LIBC_NAMESPACE::fputil::cast<float16>(-1.0)));
+  EXPECT_MATH_ERRNO(EDOM);
+}
-- 
GitLab


From 1d09925b4a6fd4af0120825132be23be12fb03d6 Mon Sep 17 00:00:00 2001
From: vporpo
Date: Fri, 18 Oct 2024 16:18:43 -0700
Subject: [PATCH 129/511] [SandboxVec][Scheduler] Boilerplate and initial
 implementation. (#112449)

This patch implements a ready-list-based scheduler that operates on the
vectorizer's DependencyGraph. The sandbox vectorizer uses it to test
whether a group of instructions can legally be vectorized. SchedBundle
is a helper container holding all DGNodes that correspond to the
instructions we are attempting to schedule with trySchedule(Instrs).
---
 .../SandboxVectorizer/DependencyGraph.h       |   7 +
 .../Vectorize/SandboxVectorizer/Scheduler.h   | 126 +++++++++++
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 .../SandboxVectorizer/DependencyGraph.cpp     |  10 +-
 .../Vectorize/SandboxVectorizer/Scheduler.cpp | 169 +++++++++++++++
 .../SandboxVectorizer/CMakeLists.txt          |   1 +
 .../SandboxVectorizer/DependencyGraphTest.cpp |  24 +++
 .../SandboxVectorizer/SchedulerTest.cpp       | 204 ++++++++++++++++++
 8 files changed, 540 insertions(+), 2 deletions(-)
 create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
 create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp
 create mode 100644 llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
index ae3ceed447c4..5be05bc80c49 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
@@ -113,8 +113,15 @@ public:
   virtual ~DGNode() = default;
   /// \Returns the number of unscheduled successors.
   unsigned getNumUnscheduledSuccs() const { return UnscheduledSuccs; }
+  void decrUnscheduledSuccs() {
+    assert(UnscheduledSuccs > 0 && "Counting error!");
+    --UnscheduledSuccs;
+  }
+  /// \Returns true if all dependent successors have been scheduled.
+  bool ready() const { return UnscheduledSuccs == 0; }
   /// \Returns true if this node has been scheduled.
   bool scheduled() const { return Scheduled; }
+  void setScheduled(bool NewVal) { Scheduled = NewVal; }
   /// \Returns true if this is before \p Other in program order.
   bool comesBefore(const DGNode *Other) { return I->comesBefore(Other->I); }
   using iterator = PredIterator;
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
new file mode 100644
index 000000000000..08972d460b40
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
@@ -0,0 +1,126 @@
+//===- Scheduler.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the bottom-up list scheduler used by the vectorizer. It is used for
+// checking the legality of vectorization and for scheduling instructions in a
+// way that makes vectorization possible, if legal.
+//
+// The legality check is performed by `trySchedule(Instrs)`, which will try to
+// schedule the IR until all instructions in `Instrs` can be scheduled together
+// back-to-back. If this fails, it is illegal to vectorize `Instrs`.
+//
+// Internally the scheduler uses the vectorizer-specific DependencyGraph class.
+//
+//===----------------------------------------------------------------------===//

+#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SCHEDULER_H
+#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SCHEDULER_H
+
+#include "llvm/SandboxIR/Instruction.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h"
+#include <queue>
+
+namespace llvm::sandboxir {
+
+class PriorityCmp {
+public:
+  bool operator()(const DGNode *N1, const DGNode *N2) {
+    // TODO: This should be a hierarchical comparator.
+    return N1->getInstruction()->comesBefore(N2->getInstruction());
+  }
+};
+
+/// The list holding nodes that are ready to schedule. Used by the scheduler.
+class ReadyListContainer {
+  PriorityCmp Cmp;
+  /// Control/Other dependencies are not modeled by the DAG to save memory.
+  /// These have to be modeled in the ready list for correctness.
+  /// This means that the list will hold back nodes that need to meet such
+  /// unmodeled dependencies.
+  std::priority_queue<DGNode *, std::vector<DGNode *>, PriorityCmp> List;
+
+public:
+  ReadyListContainer() : List(Cmp) {}
+  void insert(DGNode *N) { List.push(N); }
+  DGNode *pop() {
+    auto *Back = List.top();
+    List.pop();
+    return Back;
+  }
+  bool empty() const { return List.empty(); }
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const;
+  LLVM_DUMP_METHOD void dump() const;
+#endif // NDEBUG
+};
+
+/// The nodes that need to be scheduled back-to-back in a single scheduling
+/// cycle form a SchedBundle.
+class SchedBundle {
+public:
+  using ContainerTy = SmallVector<DGNode *, 4>;
+
+private:
+  ContainerTy Nodes;
+
+public:
+  SchedBundle() = default;
+  SchedBundle(ContainerTy &&Nodes) : Nodes(std::move(Nodes)) {}
+  using iterator = ContainerTy::iterator;
+  using const_iterator = ContainerTy::const_iterator;
+  iterator begin() { return Nodes.begin(); }
+  iterator end() { return Nodes.end(); }
+  const_iterator begin() const { return Nodes.begin(); }
+  const_iterator end() const { return Nodes.end(); }
+  /// \Returns the bundle node that comes before the others in program order.
+  DGNode *getTop() const;
+  /// \Returns the bundle node that comes after the others in program order.
+  DGNode *getBot() const;
+  /// Move all bundle instructions to \p Where back-to-back.
+  void cluster(BasicBlock::iterator Where);
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const;
+  LLVM_DUMP_METHOD void dump() const;
+#endif
+};
+
+/// The list scheduler.
+class Scheduler {
+  ReadyListContainer ReadyList;
+  DependencyGraph DAG;
+  std::optional<BasicBlock::iterator> ScheduleTopItOpt;
+  SmallVector<std::unique_ptr<SchedBundle>> Bndls;
+
+  /// \Returns a scheduling bundle containing \p Instrs.
+  SchedBundle *createBundle(ArrayRef<Instruction *> Instrs);
+  /// Schedule nodes until we can schedule \p Instrs back-to-back.
+  bool tryScheduleUntil(ArrayRef<Instruction *> Instrs);
+  /// Schedules all nodes in \p Bndl, marks them as scheduled, updates the
+  /// UnscheduledSuccs counter of all dependency predecessors, and adds any of
+  /// them that become ready to the ready list.
+  void scheduleAndUpdateReadyList(SchedBundle &Bndl);
+
+  /// Disable copies.
+  Scheduler(const Scheduler &) = delete;
+  Scheduler &operator=(const Scheduler &) = delete;
+
+public:
+  Scheduler(AAResults &AA) : DAG(AA) {}
+  ~Scheduler() {}
+
+  bool trySchedule(ArrayRef<Instruction *> Instrs);
+
+#ifndef NDEBUG
+  void dump(raw_ostream &OS) const;
+  LLVM_DUMP_METHOD void dump() const;
+#endif
+};
+
+} // namespace llvm::sandboxir
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SCHEDULER_H
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index f4e98e576379..fc4355af5af6 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_component_library(LLVMVectorize
   SandboxVectorizer/Passes/RegionsFromMetadata.cpp
   SandboxVectorizer/SandboxVectorizer.cpp
   SandboxVectorizer/SandboxVectorizerPassBuilder.cpp
+  SandboxVectorizer/Scheduler.cpp
  SandboxVectorizer/SeedCollector.cpp
   SLPVectorizer.cpp
   Vectorize.cpp
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
index 9bbeca4fc154..6217c9fecf45 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
@@ -60,7 +60,7 @@ bool PredIterator::operator==(const PredIterator &Other) const {
 
 #ifndef NDEBUG
 void DGNode::print(raw_ostream &OS, bool PrintDeps) const {
-  OS << *I << " USuccs:" << UnscheduledSuccs << "\n";
+  OS << *I << " USuccs:" << UnscheduledSuccs << " Sched:" << Scheduled << "\n";
 }
 void DGNode::dump() const { print(dbgs()); }
 void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const {
@@ -249,6 +249,10 @@ void DependencyGraph::setDefUseUnscheduledSuccs(
   // Walk over all instructions in "BotInterval" and update the counter
   // of operands that are in "TopInterval".
   for (Instruction &BotI : BotInterval) {
+    auto *BotN = getNode(&BotI);
+    // Skip scheduled nodes.
+    if (BotN->scheduled())
+      continue;
     for (Value *Op : BotI.operands()) {
       auto *OpI = dyn_cast<Instruction>(Op);
       if (OpI == nullptr)
@@ -286,7 +290,9 @@ void DependencyGraph::createNewNodes(const Interval<Instruction> &NewInterval) {
       MemDGNodeIntervalBuilder::getBotMemDGNode(TopInterval, *this);
   MemDGNode *LinkBotN =
       MemDGNodeIntervalBuilder::getTopMemDGNode(BotInterval, *this);
-  assert(LinkTopN->comesBefore(LinkBotN) && "Wrong order!");
+  assert((LinkTopN == nullptr || LinkBotN == nullptr ||
+          LinkTopN->comesBefore(LinkBotN)) &&
+         "Wrong order!");
   if (LinkTopN != nullptr && LinkBotN != nullptr) {
     LinkTopN->setNextNode(LinkBotN);
     LinkBotN->setPrevNode(LinkTopN);
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp
new file mode 100644
index 000000000000..6140c2a8dcec
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp
@@ -0,0 +1,169 @@
+//===- Scheduler.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h"
+
+namespace llvm::sandboxir {
+
+// TODO: Check if we can cache top/bottom to reduce compile-time.
+DGNode *SchedBundle::getTop() const {
+  DGNode *TopN = Nodes.front();
+  for (auto *N : drop_begin(Nodes)) {
+    if (N->getInstruction()->comesBefore(TopN->getInstruction()))
+      TopN = N;
+  }
+  return TopN;
+}
+
+DGNode *SchedBundle::getBot() const {
+  DGNode *BotN = Nodes.front();
+  for (auto *N : drop_begin(Nodes)) {
+    if (BotN->getInstruction()->comesBefore(N->getInstruction()))
+      BotN = N;
+  }
+  return BotN;
+}
+
+void SchedBundle::cluster(BasicBlock::iterator Where) {
+  for (auto *N : Nodes) {
+    auto *I = N->getInstruction();
+    if (I->getIterator() == Where)
+      ++Where; // Try to maintain bundle order.
+    I->moveBefore(*Where.getNodeParent(), Where);
+  }
+}
+
+#ifndef NDEBUG
+void SchedBundle::dump(raw_ostream &OS) const {
+  for (auto *N : Nodes)
+    OS << *N;
+}
+
+void SchedBundle::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+#ifndef NDEBUG
+void ReadyListContainer::dump(raw_ostream &OS) const {
+  auto ListCopy = List;
+  while (!ListCopy.empty()) {
+    OS << *ListCopy.top() << "\n";
+    ListCopy.pop();
+  }
+}
+
+void ReadyListContainer::dump() const {
+  dump(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+void Scheduler::scheduleAndUpdateReadyList(SchedBundle &Bndl) {
+  // Find where we should schedule the instructions.
+  assert(ScheduleTopItOpt && "Should have been set by now!");
+  auto Where = *ScheduleTopItOpt;
+  // Move all instructions in `Bndl` to `Where`.
+  Bndl.cluster(Where);
+  // Update the last scheduled bundle.
+  ScheduleTopItOpt = Bndl.getTop()->getInstruction()->getIterator();
+  // Set nodes as "scheduled" and decrement the UnscheduledSuccs counter of
+  // all dependency predecessors.
+  for (DGNode *N : Bndl) {
+    N->setScheduled(true);
+    for (auto *DepN : N->preds(DAG)) {
+      // TODO: preds() should not return nullptr.
+ if (DepN == nullptr)
+ continue;
+ DepN->decrUnscheduledSuccs();
+ if (DepN->ready())
+ ReadyList.insert(DepN);
+ }
+ }
+}
+
+SchedBundle *Scheduler::createBundle(ArrayRef<Instruction *> Instrs) {
+ SchedBundle::ContainerTy Nodes;
+ Nodes.reserve(Instrs.size());
+ for (auto *I : Instrs)
+ Nodes.push_back(DAG.getNode(I));
+ auto BndlPtr = std::make_unique<SchedBundle>(std::move(Nodes));
+ auto *Bndl = BndlPtr.get();
+ Bndls.push_back(std::move(BndlPtr));
+ return Bndl;
+}
+
+bool Scheduler::tryScheduleUntil(ArrayRef<Instruction *> Instrs) {
+ // Use a set of instructions instead of `Instrs` for fast lookups.
+ DenseSet<Instruction *> InstrsToDefer(Instrs.begin(), Instrs.end());
+ // This collects the nodes that correspond to instructions found in `Instrs`
+ // that have just become ready. These nodes won't be scheduled right away.
+ SmallVector<DGNode *> DeferredNodes;
+
+ // Keep scheduling ready nodes until we either run out of ready nodes (i.e.,
+ // ReadyList is empty) or all nodes corresponding to `Instrs` (collected in
+ // DeferredNodes) are ready to schedule.
+ while (!ReadyList.empty()) {
+ auto *ReadyN = ReadyList.pop();
+ if (InstrsToDefer.contains(ReadyN->getInstruction())) {
+ // If the ready instruction is one of those in `Instrs`, then we don't
+ // schedule it right away. Instead we defer it until we can schedule it
+ // along with the rest of the instructions in `Instrs`, at the same
+ // time in a single scheduling bundle.
+ DeferredNodes.push_back(ReadyN);
+ bool ReadyToScheduleDeferred = DeferredNodes.size() == Instrs.size();
+ if (ReadyToScheduleDeferred) {
+ scheduleAndUpdateReadyList(*createBundle(Instrs));
+ return true;
+ }
+ } else {
+ // If the ready instruction is not found in `Instrs`, then we wrap it in a
+ // scheduling bundle and schedule it right away.
+ scheduleAndUpdateReadyList(*createBundle({ReadyN->getInstruction()}));
+ }
+ }
+ assert(DeferredNodes.size() != Instrs.size() &&
+ "We should have successfully scheduled and early-returned!");
+ return false;
+}
+
+bool Scheduler::trySchedule(ArrayRef<Instruction *> Instrs) {
+ assert(all_of(drop_begin(Instrs),
+ [Instrs](Instruction *I) {
+ return I->getParent() == (*Instrs.begin())->getParent();
+ }) &&
+ "Instrs not in the same BB!");
+ // Extend the DAG to include Instrs.
+ Interval<Instruction> Extension = DAG.extend(Instrs);
+ // TODO: Set the window of the DAG that we are interested in.
+ // We start scheduling at the bottom instr of Instrs.
+ auto getBottomI = [](ArrayRef<Instruction *> Instrs) -> Instruction * {
+ return *max_element(Instrs,
+ [](auto *I1, auto *I2) { return I1->comesBefore(I2); });
+ };
+ ScheduleTopItOpt = std::next(getBottomI(Instrs)->getIterator());
+ // Add nodes to ready list.
+ for (auto &I : Extension) {
+ auto *N = DAG.getNode(&I);
+ if (N->ready())
+ ReadyList.insert(N);
+ }
+ // Try to schedule all nodes until we can schedule Instrs back-to-back.
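+ // (This only fails if the ready list empties before every instruction in
+ // Instrs becomes ready, e.g., when one of them was already scheduled as a
+ // filler by an earlier call.)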
+ return tryScheduleUntil(Instrs); +} + +#ifndef NDEBUG +void Scheduler::dump(raw_ostream &OS) const { + OS << "ReadyList:\n"; + ReadyList.dump(OS); +} +void Scheduler::dump() const { dump(dbgs()); } +#endif // NDEBUG + +} // namespace llvm::sandboxir diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt index dcd7232db5f6..24512cb0225e 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt @@ -11,5 +11,6 @@ add_llvm_unittest(SandboxVectorizerTests DependencyGraphTest.cpp IntervalTest.cpp LegalityTest.cpp + SchedulerTest.cpp SeedCollectorTest.cpp ) diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index 3f84ad1f731d..061d57c31ce2 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -254,6 +254,18 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { EXPECT_EQ(N0->getNumUnscheduledSuccs(), 1u); // N1 EXPECT_EQ(N1->getNumUnscheduledSuccs(), 0u); EXPECT_EQ(N2->getNumUnscheduledSuccs(), 0u); + + // Check decrUnscheduledSuccs. + N0->decrUnscheduledSuccs(); + EXPECT_EQ(N0->getNumUnscheduledSuccs(), 0u); +#ifndef NDEBUG + EXPECT_DEATH(N0->decrUnscheduledSuccs(), ".*Counting.*"); +#endif // NDEBUG + + // Check scheduled(), setScheduled(). + EXPECT_FALSE(N0->scheduled()); + N0->setScheduled(true); + EXPECT_TRUE(N0->scheduled()); } TEST_F(DependencyGraphTest, Preds) { @@ -773,4 +785,16 @@ define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %v4, i8 %v5) { EXPECT_EQ(S4N->getNumUnscheduledSuccs(), 1u); // S5N EXPECT_EQ(S5N->getNumUnscheduledSuccs(), 0u); } + + { + // Check UnscheduledSuccs when a node is scheduled + sandboxir::DependencyGraph DAG(getAA(*LLVMF)); + DAG.extend({S2, S2}); + auto *S2N = cast(DAG.getNode(S2)); + S2N->setScheduled(true); + + DAG.extend({S1, S1}); + auto *S1N = cast(DAG.getNode(S1)); + EXPECT_EQ(S1N->getNumUnscheduledSuccs(), 0u); // S1 is scheduled + } } diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp new file mode 100644 index 000000000000..92e767e55fbd --- /dev/null +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp @@ -0,0 +1,204 @@ +//===- SchedulerTest.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Dominators.h" +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Function.h" +#include "llvm/SandboxIR/Instruction.h" +#include "llvm/Support/SourceMgr.h" +#include "gmock/gmock-matchers.h" +#include "gtest/gtest.h" + +using namespace llvm; + +struct SchedulerTest : public testing::Test { + LLVMContext C; + std::unique_ptr M; + std::unique_ptr AC; + std::unique_ptr DT; + std::unique_ptr BAA; + std::unique_ptr AA; + + void parseIR(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + M = parseAssemblyString(IR, Err, C); + if (!M) + Err.print("SchedulerTest", errs()); + } + + AAResults &getAA(llvm::Function &LLVMF) { + TargetLibraryInfoImpl TLII; + TargetLibraryInfo TLI(TLII); + AA = std::make_unique(TLI); + AC = std::make_unique(LLVMF); + DT = std::make_unique(LLVMF); + BAA = std::make_unique(M->getDataLayout(), LLVMF, TLI, *AC, + DT.get()); + AA->addAAResult(*BAA); + return *AA; + } +}; + +TEST_F(SchedulerTest, SchedBundle) { + parseIR(C, R"IR( +define void @foo(ptr %ptr, i8 %v0, i8 %v1) { + store i8 %v0, ptr %ptr + %other = add i8 %v0, %v1 + store i8 %v1, ptr %ptr + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *S0 = cast(&*It++); + auto *Other = &*It++; + auto *S1 = cast(&*It++); + auto *Ret = cast(&*It++); + + sandboxir::DependencyGraph DAG(getAA(*LLVMF)); + DAG.extend({&*BB->begin(), BB->getTerminator()}); + auto *SN0 = DAG.getNode(S0); + auto *SN1 = DAG.getNode(S1); + sandboxir::SchedBundle Bndl({SN0, SN1}); + + // Check getTop(). + EXPECT_EQ(Bndl.getTop(), SN0); + // Check getBot(). + EXPECT_EQ(Bndl.getBot(), SN1); + // Check cluster(). + Bndl.cluster(S1->getIterator()); + { + auto It = BB->begin(); + EXPECT_EQ(&*It++, Other); + EXPECT_EQ(&*It++, S0); + EXPECT_EQ(&*It++, S1); + EXPECT_EQ(&*It++, Ret); + S0->moveBefore(Other); + } + + Bndl.cluster(S0->getIterator()); + { + auto It = BB->begin(); + EXPECT_EQ(&*It++, S0); + EXPECT_EQ(&*It++, S1); + EXPECT_EQ(&*It++, Other); + EXPECT_EQ(&*It++, Ret); + S1->moveAfter(Other); + } + + Bndl.cluster(Other->getIterator()); + { + auto It = BB->begin(); + EXPECT_EQ(&*It++, S0); + EXPECT_EQ(&*It++, S1); + EXPECT_EQ(&*It++, Other); + EXPECT_EQ(&*It++, Ret); + S1->moveAfter(Other); + } + + Bndl.cluster(Ret->getIterator()); + { + auto It = BB->begin(); + EXPECT_EQ(&*It++, Other); + EXPECT_EQ(&*It++, S0); + EXPECT_EQ(&*It++, S1); + EXPECT_EQ(&*It++, Ret); + Other->moveBefore(S1); + } + + Bndl.cluster(BB->end()); + { + auto It = BB->begin(); + EXPECT_EQ(&*It++, Other); + EXPECT_EQ(&*It++, Ret); + EXPECT_EQ(&*It++, S0); + EXPECT_EQ(&*It++, S1); + Ret->moveAfter(S1); + Other->moveAfter(S0); + } + // Check iterators. 
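+ // (Both the const and non-const iterators should visit the bundle's nodes
+ // in insertion order: SN0 followed by SN1.)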
+ EXPECT_THAT(Bndl, testing::ElementsAre(SN0, SN1)); + EXPECT_THAT((const sandboxir::SchedBundle &)Bndl, + testing::ElementsAre(SN0, SN1)); +} + +TEST_F(SchedulerTest, Basic) { + parseIR(C, R"IR( +define void @foo(ptr %ptr, i8 %v0, i8 %v1) { + store i8 %v0, ptr %ptr + store i8 %v1, ptr %ptr + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *S0 = cast(&*It++); + auto *S1 = cast(&*It++); + auto *Ret = cast(&*It++); + + { + // Schedule all instructions in sequence. + sandboxir::Scheduler Sched(getAA(*LLVMF)); + EXPECT_TRUE(Sched.trySchedule({Ret})); + EXPECT_TRUE(Sched.trySchedule({S1})); + EXPECT_TRUE(Sched.trySchedule({S0})); + } + { + // Skip instructions. + sandboxir::Scheduler Sched(getAA(*LLVMF)); + EXPECT_TRUE(Sched.trySchedule({Ret})); + EXPECT_TRUE(Sched.trySchedule({S0})); + } + { + // Try invalid scheduling + sandboxir::Scheduler Sched(getAA(*LLVMF)); + EXPECT_TRUE(Sched.trySchedule({Ret})); + EXPECT_TRUE(Sched.trySchedule({S0})); + EXPECT_FALSE(Sched.trySchedule({S1})); + } +} + +TEST_F(SchedulerTest, Bundles) { + parseIR(C, R"IR( +define void @foo(ptr noalias %ptr0, ptr noalias %ptr1) { + %ld0 = load i8, ptr %ptr0 + %ld1 = load i8, ptr %ptr1 + store i8 %ld0, ptr %ptr0 + store i8 %ld1, ptr %ptr1 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *L0 = cast(&*It++); + auto *L1 = cast(&*It++); + auto *S0 = cast(&*It++); + auto *S1 = cast(&*It++); + auto *Ret = cast(&*It++); + + sandboxir::Scheduler Sched(getAA(*LLVMF)); + EXPECT_TRUE(Sched.trySchedule({Ret})); + EXPECT_TRUE(Sched.trySchedule({S0, S1})); + EXPECT_TRUE(Sched.trySchedule({L0, L1})); +} -- GitLab From f5bd36aece8f6b12422ce30903dd78d1b5006efd Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 18 Oct 2024 23:18:50 +0000 Subject: [PATCH 130/511] [gn build] Port 1d09925b4a6f --- llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 1 + .../unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index 85dfd7738c17..d54b12e3a20d 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -19,6 +19,7 @@ static_library("Vectorize") { "SandboxVectorizer/Passes/RegionsFromMetadata.cpp", "SandboxVectorizer/SandboxVectorizer.cpp", "SandboxVectorizer/SandboxVectorizerPassBuilder.cpp", + "SandboxVectorizer/Scheduler.cpp", "SandboxVectorizer/SeedCollector.cpp", "VPlan.cpp", "VPlanAnalysis.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn index a01525a0c80b..44640c6527c9 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn @@ -13,5 +13,6 @@ unittest("SandboxVectorizerTests") { "DependencyGraphTest.cpp", "IntervalTest.cpp", "LegalityTest.cpp", + "SchedulerTest.cpp", ] } -- GitLab From 0138adb68fc20c2fd1a368ca3a2e531debed3852 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 18 
Oct 2024 16:35:59 -0700 Subject: [PATCH 131/511] [nfc][lsan] Rename `ScanExtraStack` and pass `region_type` (#113004) --- compiler-rt/lib/lsan/lsan_common.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index bcb7baa6c530..25d79544b2f3 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -394,10 +394,10 @@ void ScanGlobalRange(uptr begin, uptr end, Frontier *frontier) { } template -void ScanExtraStack(const InternalMmapVector &ranges, Frontier *frontier, - Accessor &accessor) { +void ScanRanges(const InternalMmapVector &ranges, Frontier *frontier, + const char *region_type, Accessor &accessor) { for (uptr i = 0; i < ranges.size(); i++) { - ScanForPointers(ranges[i].begin, ranges[i].end, frontier, "FAKE STACK", + ScanForPointers(ranges[i].begin, ranges[i].end, frontier, region_type, kReachable, accessor); } } @@ -405,7 +405,7 @@ void ScanExtraStack(const InternalMmapVector &ranges, Frontier *frontier, void ScanExtraStackRanges(const InternalMmapVector &ranges, Frontier *frontier) { DirectMemoryAccessor accessor; - ScanExtraStack(ranges, frontier, accessor); + ScanRanges(ranges, frontier, "FAKE STACK", accessor); } # if SANITIZER_FUCHSIA @@ -499,7 +499,7 @@ static void ProcessThread(tid_t os_id, uptr sp, ScanForPointers(stack_begin, stack_end, frontier, "STACK", kReachable, accessor); GetThreadExtraStackRangesLocked(os_id, &extra_ranges); - ScanExtraStack(extra_ranges, frontier, accessor); + ScanRanges(extra_ranges, frontier, "FAKE STACK", accessor); } if (flags()->use_tls) { -- GitLab From 69d3a44eded0b0792c8d69e830579f84b8e81eeb Mon Sep 17 00:00:00 2001 From: OverMighty Date: Sat, 19 Oct 2024 01:40:40 +0200 Subject: [PATCH 132/511] [libc][math][c23] Add log10f16 C23 math function (#106091) Part of #95250. 
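As a usage sketch (illustrative only, not part of the patch): on a C23
toolchain where `_Float16` is available, the new entrypoint can be called
directly, and the tests below check it against MPFR to within 0.5 ulp in
all rounding modes.

    #include <math.h>

    _Float16 decades(_Float16 x) {
      return log10f16(x); /* sets errno to EDOM for x < 0 */
    }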
--- libc/config/gpu/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 22 +++ libc/src/math/generic/expxf16.h | 14 ++ libc/src/math/generic/log10f16.cpp | 164 +++++++++++++++++++++ libc/src/math/log10f16.h | 21 +++ libc/test/src/math/CMakeLists.txt | 11 ++ libc/test/src/math/log10f16_test.cpp | 40 +++++ libc/test/src/math/smoke/CMakeLists.txt | 13 ++ libc/test/src/math/smoke/log10f16_test.cpp | 50 +++++++ 13 files changed, 340 insertions(+), 1 deletion(-) create mode 100644 libc/src/math/generic/log10f16.cpp create mode 100644 libc/src/math/log10f16.h create mode 100644 libc/test/src/math/log10f16_test.cpp create mode 100644 libc/test/src/math/smoke/log10f16_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 2cc54e8a4b97..13bb88894297 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -567,6 +567,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.llogbf16 libc.src.math.llrintf16 libc.src.math.llroundf16 + libc.src.math.log10f16 libc.src.math.log2f16 libc.src.math.logbf16 libc.src.math.logf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 06ea7bba81f3..6ed6ad8c2400 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -660,6 +660,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.llogbf16 libc.src.math.llrintf16 libc.src.math.llroundf16 + libc.src.math.log10f16 libc.src.math.log2f16 libc.src.math.logbf16 libc.src.math.logf16 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 6591cbbdc155..88751e2453f2 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -312,7 +312,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | log | |check| | |check| | | |check| | | 7.12.6.11 | F.10.3.11 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| log10 | |check| | |check| | | | | 7.12.6.12 | F.10.3.12 | +| log10 | |check| | |check| | | |check| | | 7.12.6.12 | F.10.3.12 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | log10p1 | | | | | | 7.12.6.13 | F.10.3.13 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index d2a073847503..33bec2f71627 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -642,6 +642,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"log10", RetValSpec, [ArgSpec]>, FunctionSpec<"log10f", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"log10f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, FunctionSpec<"log1p", RetValSpec, [ArgSpec]>, FunctionSpec<"log1pf", RetValSpec, [ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 516bed499b19..9239f029abf8 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -334,6 +334,7 @@ add_math_entrypoint_object(ldexpf128) 
add_math_entrypoint_object(log10) add_math_entrypoint_object(log10f) +add_math_entrypoint_object(log10f16) add_math_entrypoint_object(log1p) add_math_entrypoint_object(log1pf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index d7c7a3431d3d..00cad6f85b4b 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -2173,6 +2173,28 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + log10f16 + SRCS + log10f16.cpp + HDRS + ../log10f16.h + DEPENDS + .expxf16 + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( log1p SRCS diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h index 56ed6ee7cc00..67bb24830751 100644 --- a/libc/src/math/generic/expxf16.h +++ b/libc/src/math/generic/expxf16.h @@ -316,6 +316,20 @@ constexpr cpp::array LOG2F_F = { 0x1.d053f6p-1f, 0x1.dc899ap-1f, 0x1.e88c6cp-1f, 0x1.f45e08p-1f, }; +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > for i from 0 to 31 do print(round(log10(1 + i * 2^-5), SG, RN)); +constexpr cpp::array LOG10F_F = { + 0x0p+0f, 0x1.b5e908p-7f, 0x1.af5f92p-6f, 0x1.3ed11ap-5f, + 0x1.a30a9ep-5f, 0x1.02428cp-4f, 0x1.31b306p-4f, 0x1.5fe804p-4f, + 0x1.8cf184p-4f, 0x1.b8de4ep-4f, 0x1.e3bc1ap-4f, 0x1.06cbd6p-3f, + 0x1.1b3e72p-3f, 0x1.2f3b6ap-3f, 0x1.42c7e8p-3f, 0x1.55e8c6p-3f, + 0x1.68a288p-3f, 0x1.7af974p-3f, 0x1.8cf184p-3f, 0x1.9e8e7cp-3f, + 0x1.afd3e4p-3f, 0x1.c0c514p-3f, 0x1.d1653p-3f, 0x1.e1b734p-3f, + 0x1.f1bdeep-3f, 0x1.00be06p-2f, 0x1.087a08p-2f, 0x1.101432p-2f, + 0x1.178da6p-2f, 0x1.1ee778p-2f, 0x1.2622bp-2f, 0x1.2d404cp-2f, +}; + // Generated by Sollya with the following commands: // > display = hexadecimal; // > for i from 0 to 31 do print(round(1 / (1 + i * 2^-5), SG, RN)); diff --git a/libc/src/math/generic/log10f16.cpp b/libc/src/math/generic/log10f16.cpp new file mode 100644 index 000000000000..990bcabaf687 --- /dev/null +++ b/libc/src/math/generic/log10f16.cpp @@ -0,0 +1,164 @@ +//===-- Half-precision log10(x) function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log10f16.h" +#include "expxf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" + +namespace LIBC_NAMESPACE_DECL { + +#ifdef LIBC_TARGET_CPU_HAS_FMA +static constexpr size_t N_LOG10F16_EXCEPTS = 11; +#else +static constexpr size_t N_LOG10F16_EXCEPTS = 17; +#endif + +static constexpr fputil::ExceptValues + LOG10F16_EXCEPTS = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.e3cp-3, log10f16(x) = -0x1.40cp-1 (RZ) + {0x338fU, 0xb903U, 0U, 1U, 0U}, + // x = 0x1.fep-3, log10f16(x) = -0x1.35p-1 (RZ) + {0x33f8U, 0xb8d4U, 0U, 1U, 1U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA + // x = 0x1.394p-1, log10f16(x) = -0x1.b4cp-3 (RZ) + {0x38e5U, 0xb2d3U, 0U, 1U, 1U}, +#endif + // x = 0x1.ea8p-1, log10f16(x) = -0x1.31p-6 (RZ) + {0x3baaU, 0xa4c4U, 0U, 1U, 1U}, + // x = 0x1.ebp-1, log10f16(x) = -0x1.29cp-6 (RZ) + {0x3bacU, 0xa4a7U, 0U, 1U, 1U}, + // x = 0x1.f3p-1, log10f16(x) = -0x1.6dcp-7 (RZ) + {0x3bccU, 0xa1b7U, 0U, 1U, 1U}, +// x = 0x1.f38p-1, log10f16(x) = -0x1.5f8p-7 (RZ) +#ifndef LIBC_TARGET_CPU_HAS_FMA + {0x3bceU, 0xa17eU, 0U, 1U, 1U}, + // x = 0x1.fd8p-1, log10f16(x) = -0x1.168p-9 (RZ) + {0x3bf6U, 0x985aU, 0U, 1U, 1U}, + // x = 0x1.ff8p-1, log10f16(x) = -0x1.bccp-12 (RZ) + {0x3bfeU, 0x8ef3U, 0U, 1U, 1U}, + // x = 0x1.374p+0, log10f16(x) = 0x1.5b8p-4 (RZ) + {0x3cddU, 0x2d6eU, 1U, 0U, 1U}, + // x = 0x1.3ecp+1, log10f16(x) = 0x1.958p-2 (RZ) + {0x40fbU, 0x3656U, 1U, 0U, 1U}, +#endif + // x = 0x1.4p+3, log10f16(x) = 0x1p+0 (RZ) + {0x4900U, 0x3c00U, 0U, 0U, 0U}, + // x = 0x1.9p+6, log10f16(x) = 0x1p+1 (RZ) + {0x5640U, 0x4000U, 0U, 0U, 0U}, + // x = 0x1.f84p+6, log10f16(x) = 0x1.0ccp+1 (RZ) + {0x57e1U, 0x4033U, 1U, 0U, 0U}, + // x = 0x1.f4p+9, log10f16(x) = 0x1.8p+1 (RZ) + {0x63d0U, 0x4200U, 0U, 0U, 0U}, + // x = 0x1.388p+13, log10f16(x) = 0x1p+2 (RZ) + {0x70e2U, 0x4400U, 0U, 0U, 0U}, + // x = 0x1.674p+13, log10f16(x) = 0x1.03cp+2 (RZ) + {0x719dU, 0x440fU, 1U, 0U, 0U}, + }}; + +LLVM_LIBC_FUNCTION(float16, log10f16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + + // If x <= 0, or x is 1, or x is +inf, or x is NaN. + if (LIBC_UNLIKELY(x_u == 0U || x_u == 0x3c00U || x_u >= 0x7c00U)) { + // log10(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // log10(+/-0) = −inf + if ((x_u & 0x7fffU) == 0U) { + fputil::raise_except_if_required(FE_DIVBYZERO); + return FPBits::inf(Sign::NEG).get_val(); + } + + if (x_u == 0x3c00U) + return FPBits::zero().get_val(); + + // When x < 0. 
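+ // (0x8000 is the bit pattern of -0.0, so x_u > 0x8000U matches every
+ // negative value; the +/-0 bit patterns were already handled above.)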
+ if (x_u > 0x8000U) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + // log10(+inf) = +inf + return FPBits::inf().get_val(); + } + + if (auto r = LOG10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + + // To compute log10(x), we perform the following range reduction: + // x = 2^m * 1.mant, + // log10(x) = m * log10(2) + log10(1.mant). + // To compute log10(1.mant), let f be the highest 6 bits including the hidden + // bit, and d be the difference (1.mant - f), i.e., the remaining 5 bits of + // the mantissa, then: + // log10(1.mant) = log10(f) + log10(1.mant / f) + // = log10(f) + log10(1 + d/f) + // since d/f is sufficiently small. + // We store log10(f) and 1/f in the lookup tables LOG10F_F and ONE_OVER_F_F + // respectively. + + int m = -FPBits::EXP_BIAS; + + // When x is subnormal, normalize it. + if ((x_u & FPBits::EXP_MASK) == 0U) { + // Can't pass an integer to fputil::cast directly. + constexpr float NORMALIZE_EXP = 1U << FPBits::FRACTION_LEN; + x_bits = FPBits(x_bits.get_val() * fputil::cast(NORMALIZE_EXP)); + x_u = x_bits.uintval(); + m -= FPBits::FRACTION_LEN; + } + + uint16_t mant = x_bits.get_mantissa(); + // Leading 10 - 5 = 5 bits of the mantissa. + int f = mant >> 5; + // Unbiased exponent. + m += x_u >> FPBits::FRACTION_LEN; + + // Set bits to 1.mant instead of 2^m * 1.mant. + x_bits.set_biased_exponent(FPBits::EXP_BIAS); + float mant_f = x_bits.get_val(); + // v = 1.mant * 1/f - 1 = d/f + float v = fputil::multiply_add(mant_f, ONE_OVER_F_F[f], -1.0f); + + // Degree-3 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax(log10(1 + x)/x, 2, [|SG...|], [-2^-5, 2^-5]); + // > x * P; + float log10p1_d_over_f = + v * fputil::polyeval(v, 0x1.bcb7bp-2f, -0x1.bce168p-3f, 0x1.28acb8p-3f); + // log10(1.mant) = log10(f) + log10(1 + d/f) + float log10_1_mant = LOG10F_F[f] + log10p1_d_over_f; + return fputil::cast( + fputil::multiply_add(static_cast(m), LOG10F_2, log10_1_mant)); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/log10f16.h b/libc/src/math/log10f16.h new file mode 100644 index 000000000000..298deb370e0b --- /dev/null +++ b/libc/src/math/log10f16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for log10f16 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_LOG10F16_H +#define LLVM_LIBC_SRC_MATH_LOG10F16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 log10f16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_LOG10F16_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 24a5abec898a..befa8cff0720 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1846,6 +1846,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + log10f16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + log10f16_test.cpp + DEPENDS + libc.src.math.log10f16 +) + add_fp_unittest( log1p_test NEED_MPFR diff --git a/libc/test/src/math/log10f16_test.cpp b/libc/test/src/math/log10f16_test.cpp new file mode 100644 index 000000000000..a71e3309ac5f --- /dev/null +++ b/libc/test/src/math/log10f16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for log10f16 --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log10f16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcLog10f16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf]; +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0]; +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcLog10f16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, + LIBC_NAMESPACE::log10f16(x), 0.5); + } +} + +TEST_F(LlvmLibcLog10f16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, + LIBC_NAMESPACE::log10f16(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 3c077240356b..d41041c9bb0a 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3632,6 +3632,19 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + log10f16_test + SUITE + libc-math-smoke-tests + SRCS + log10f16_test.cpp + DEPENDS + libc.hdr.fenv_macros + libc.src.errno.errno + libc.src.math.log10f16 + libc.src.__support.FPUtil.cast +) + add_fp_unittest( log1p_test SUITE diff --git a/libc/test/src/math/smoke/log10f16_test.cpp b/libc/test/src/math/smoke/log10f16_test.cpp new file mode 100644 index 000000000000..471e19893332 --- /dev/null +++ b/libc/test/src/math/smoke/log10f16_test.cpp @@ -0,0 +1,50 @@ +//===-- Unittests for log10f16 --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" +#include "src/errno/libc_errno.h" +#include "src/math/log10f16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcLog10f16Test = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcLog10f16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log10f16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log10f16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::log10f16(inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log10f16(neg_inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING( + neg_inf, LIBC_NAMESPACE::log10f16(zero), FE_DIVBYZERO); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING( + neg_inf, LIBC_NAMESPACE::log10f16(neg_zero), FE_DIVBYZERO); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING( + zero, + LIBC_NAMESPACE::log10f16(LIBC_NAMESPACE::fputil::cast(1.0))); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING( + aNaN, + LIBC_NAMESPACE::log10f16(LIBC_NAMESPACE::fputil::cast(-1.0))); + EXPECT_MATH_ERRNO(EDOM); +} -- GitLab From d97f6d1ae90e7c95be17c9cb7821ad94fe4587fe Mon Sep 17 00:00:00 2001 From: OverMighty Date: Sat, 19 Oct 2024 01:41:52 +0200 Subject: [PATCH 133/511] [libc][math][c23] Add sqrtf16 C23 math function (#112406) Part of #95250. --- libc/config/gpu/entrypoints.txt | 1 + libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/spec/stdc.td | 1 + libc/src/__support/FPUtil/generic/sqrt.h | 3 ++- libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 12 ++++++++++ libc/src/math/generic/sqrtf16.cpp | 20 ++++++++++++++++ libc/src/math/sqrtf16.h | 21 +++++++++++++++++ libc/test/src/math/CMakeLists.txt | 11 +++++++++ libc/test/src/math/smoke/CMakeLists.txt | 12 ++++++++++ libc/test/src/math/smoke/sqrtf16_test.cpp | 13 +++++++++++ libc/test/src/math/sqrtf16_test.cpp | 28 +++++++++++++++++++++++ 14 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 libc/src/math/generic/sqrtf16.cpp create mode 100644 libc/src/math/sqrtf16.h create mode 100644 libc/test/src/math/smoke/sqrtf16_test.cpp create mode 100644 libc/test/src/math/sqrtf16_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 13bb88894297..38e9f2e685ca 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -590,6 +590,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 libc.src.math.sinhf16 + libc.src.math.sqrtf16 libc.src.math.tanhf16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 885827d304ef..71c6e874429f 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -680,6 +680,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 libc.src.math.sinpif16 + libc.src.math.sqrtf16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt 
b/libc/config/linux/x86_64/entrypoints.txt index 6ed6ad8c2400..9bc63edf06f2 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -684,6 +684,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.setpayloadsigf16 libc.src.math.sinhf16 libc.src.math.sinpif16 + libc.src.math.sqrtf16 libc.src.math.tanhf16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 88751e2453f2..ce4df92393ce 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -344,7 +344,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sinpi | |check| | | | |check| | | 7.12.4.13 | F.10.1.13 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| sqrt | |check| | |check| | |check| | | |check| | 7.12.7.10 | F.10.4.10 | +| sqrt | |check| | |check| | |check| | |check| | |check| | 7.12.7.10 | F.10.4.10 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | tan | |check| | |check| | | | | 7.12.4.7 | F.10.1.7 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 33bec2f71627..d1ebc6ffb582 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -754,6 +754,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"sqrt", RetValSpec, [ArgSpec]>, FunctionSpec<"sqrtf", RetValSpec, [ArgSpec]>, FunctionSpec<"sqrtl", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"sqrtf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"sqrtf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"trunc", RetValSpec, [ArgSpec]>, diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h index 01af4bb7c900..497ebd145c6b 100644 --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -139,7 +139,8 @@ sqrt(InType x) { for (InStorageType current_bit = ONE >> 1; current_bit; current_bit >>= 1) { r <<= 1; - InStorageType tmp = (y << 1) + current_bit; // 2*y(n - 1) + 2^(-n-1) + // 2*y(n - 1) + 2^(-n-1) + InStorageType tmp = static_cast((y << 1) + current_bit); if (r >= tmp) { r -= tmp; y += current_bit; diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 9239f029abf8..cb4817348cbb 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -492,6 +492,7 @@ add_math_entrypoint_object(sinhf16) add_math_entrypoint_object(sqrt) add_math_entrypoint_object(sqrtf) add_math_entrypoint_object(sqrtl) +add_math_entrypoint_object(sqrtf16) add_math_entrypoint_object(sqrtf128) add_math_entrypoint_object(tan) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 00cad6f85b4b..35e7347b9136 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -3249,6 +3249,18 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + sqrtf16 + SRCS + sqrtf16.cpp + HDRS + ../sqrtf16.h + DEPENDS + libc.src.__support.FPUtil.sqrt + COMPILE_OPTIONS + 
-O3 +) + add_entrypoint_object( sqrtf128 SRCS diff --git a/libc/src/math/generic/sqrtf16.cpp b/libc/src/math/generic/sqrtf16.cpp new file mode 100644 index 000000000000..0aa4a201b3e6 --- /dev/null +++ b/libc/src/math/generic/sqrtf16.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of sqrtf16 function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sqrtf16.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float16, sqrtf16, (float16 x)) { + return fputil::sqrt(x); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/sqrtf16.h b/libc/src/math/sqrtf16.h new file mode 100644 index 000000000000..bb09c4fdaf8d --- /dev/null +++ b/libc/src/math/sqrtf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for sqrtf16 -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SQRTF16_H +#define LLVM_LIBC_SRC_MATH_SQRTF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 sqrtf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SQRTF16_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index befa8cff0720..262c717dd27d 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1490,6 +1490,17 @@ add_fp_unittest( libc.src.math.sqrtl ) +add_fp_unittest( + sqrtf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + sqrtf16_test.cpp + DEPENDS + libc.src.math.sqrtf16 +) + add_fp_unittest( generic_sqrtf_test NEED_MPFR diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index d41041c9bb0a..b2d1871541ef 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -2889,6 +2889,18 @@ add_fp_unittest( libc.src.math.sqrtl ) +add_fp_unittest( + sqrtf16_test + SUITE + libc-math-smoke-tests + SRCS + sqrtf16_test.cpp + HDRS + SqrtTest.h + DEPENDS + libc.src.math.sqrtf16 +) + add_fp_unittest( sqrtf128_test SUITE diff --git a/libc/test/src/math/smoke/sqrtf16_test.cpp b/libc/test/src/math/smoke/sqrtf16_test.cpp new file mode 100644 index 000000000000..d62049661eec --- /dev/null +++ b/libc/test/src/math/smoke/sqrtf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for sqrtf16 ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/math/sqrtf16.h" + +LIST_SQRT_TESTS(float16, LIBC_NAMESPACE::sqrtf16) diff --git a/libc/test/src/math/sqrtf16_test.cpp b/libc/test/src/math/sqrtf16_test.cpp new file mode 100644 index 000000000000..f6e899676124 --- /dev/null +++ b/libc/test/src/math/sqrtf16_test.cpp @@ -0,0 +1,28 @@ +//===-- Exhaustive test for sqrtf16 ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sqrtf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcSqrtf16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf]; +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +TEST_F(LlvmLibcSqrtf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sqrt, x, + LIBC_NAMESPACE::sqrtf16(x), 0.5); + } +} -- GitLab From 2b7e9d27817da54c34a6f02dc00d2466c31f6fa0 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 18 Oct 2024 16:54:55 -0700 Subject: [PATCH 134/511] [lldb] Add missing whitespace in help text --- lldb/source/Interpreter/CommandInterpreter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index bfac3f4fea8d..c990972ca64b 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -839,7 +839,7 @@ void CommandInterpreter::LoadCommandDictionary() { "argument displays at most that many frames. The argument 'all' " "displays all threads. Use 'settings set frame-format' to customize " "the printing of individual frames and 'settings set thread-format' " - "to customize the thread header. Frame recognizers may filter the" + "to customize the thread header. Frame recognizers may filter the " "list. Use 'thread backtrace -u (--unfiltered)' to see them all.", "bt [ | all]", 0, false)); if (bt_regex_cmd_up) { -- GitLab From f7b6dc821ad2aa02e027db76f193b85a87443e0b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 18 Oct 2024 19:16:42 -0500 Subject: [PATCH 135/511] [Clang] Fix missing `-` in argument to nvlinker --- clang/lib/Driver/ToolChains/Cuda.cpp | 2 +- clang/test/Driver/cuda-cross-compiling.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index e9d2e3fe6d5c..412b379304b1 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -635,7 +635,7 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, getNVPTXTargetFeatures(C.getDriver(), getToolChain().getTriple(), Args, Features); CmdArgs.push_back( - Args.MakeArgString("--plugin-opt=mattr=" + llvm::join(Features, ","))); + Args.MakeArgString("--plugin-opt=-mattr=" + llvm::join(Features, ","))); // Add paths for the default clang library path. 
SmallString<256> DefaultLibPath = diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c index 54c291fac66f..126e9e9fc83d 100644 --- a/clang/test/Driver/cuda-cross-compiling.c +++ b/clang/test/Driver/cuda-cross-compiling.c @@ -104,4 +104,4 @@ // RUN: %clang -target nvptx64-nvidia-cuda --cuda-feature=+ptx63 -march=sm_52 -### %s 2>&1 \ // RUN: | FileCheck -check-prefix=FEATURE %s -// FEATURE: clang-nvlink-wrapper{{.*}}"--plugin-opt=mattr=+ptx63" +// FEATURE: clang-nvlink-wrapper{{.*}}"--plugin-opt=-mattr=+ptx63" -- GitLab From 864902e9b4d8bc6d3f0852d5c475e3dc97dd8335 Mon Sep 17 00:00:00 2001 From: Renaud Kauffmann Date: Fri, 18 Oct 2024 17:35:38 -0700 Subject: [PATCH 136/511] [flang][cuda] Call CUFGetDeviceAddress to get global device address from host address (#112989) --- .../Optimizer/Transforms/CufOpConversion.h | 2 + .../Optimizer/Transforms/CufOpConversion.cpp | 96 ++++++++++++++++--- flang/test/Fir/CUDA/cuda-data-transfer.fir | 43 +++++++++ 3 files changed, 126 insertions(+), 15 deletions(-) diff --git a/flang/include/flang/Optimizer/Transforms/CufOpConversion.h b/flang/include/flang/Optimizer/Transforms/CufOpConversion.h index 79ce4ac5c6cb..0a71cdfddec1 100644 --- a/flang/include/flang/Optimizer/Transforms/CufOpConversion.h +++ b/flang/include/flang/Optimizer/Transforms/CufOpConversion.h @@ -18,12 +18,14 @@ class LLVMTypeConverter; namespace mlir { class DataLayout; +class SymbolTable; } namespace cuf { void populateCUFToFIRConversionPatterns(const fir::LLVMTypeConverter &converter, mlir::DataLayout &dl, + const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns); } // namespace cuf diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp index 91ef1259332d..9df559ee0ab1 100644 --- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp @@ -77,6 +77,69 @@ static bool hasDoubleDescriptors(OpTy op) { return false; } +static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter, + mlir::Location loc, mlir::Type toTy, + mlir::Value val) { + if (val.getType() != toTy) + return rewriter.create(loc, toTy, val); + return val; +} + +mlir::Value getDeviceAddress(mlir::PatternRewriter &rewriter, + mlir::OpOperand &operand, + const mlir::SymbolTable &symtab) { + mlir::Value v = operand.get(); + auto declareOp = v.getDefiningOp(); + if (!declareOp) + return v; + + auto addrOfOp = declareOp.getMemref().getDefiningOp(); + if (!addrOfOp) + return v; + + auto globalOp = symtab.lookup( + addrOfOp.getSymbol().getRootReference().getValue()); + + if (!globalOp) + return v; + + bool isDevGlobal{false}; + auto attr = globalOp.getDataAttrAttr(); + if (attr) { + switch (attr.getValue()) { + case cuf::DataAttribute::Device: + case cuf::DataAttribute::Managed: + case cuf::DataAttribute::Pinned: + isDevGlobal = true; + break; + default: + break; + } + } + if (!isDevGlobal) + return v; + mlir::OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(operand.getOwner()); + auto loc = declareOp.getLoc(); + auto mod = declareOp->getParentOfType(); + fir::FirOpBuilder builder(rewriter, mod); + + mlir::func::FuncOp callee = + fir::runtime::getRuntimeFunc(loc, builder); + auto fTy = callee.getFunctionType(); + auto toTy = fTy.getInput(0); + mlir::Value inputArg = + createConvertOp(rewriter, loc, toTy, declareOp.getResult()); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + 
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, inputArg, sourceFile, sourceLine)}; + auto call = rewriter.create(loc, callee, args); + + return call->getResult(0); +} + template static mlir::LogicalResult convertOpToCall(OpTy op, mlir::PatternRewriter &rewriter, @@ -363,18 +426,14 @@ struct CufFreeOpConversion : public mlir::OpRewritePattern { } }; -static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter, - mlir::Location loc, mlir::Type toTy, - mlir::Value val) { - if (val.getType() != toTy) - return rewriter.create(loc, toTy, val); - return val; -} - struct CufDataTransferOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; + CufDataTransferOpConversion(mlir::MLIRContext *context, + const mlir::SymbolTable &symtab) + : OpRewritePattern(context), symtab{symtab} {} + mlir::LogicalResult matchAndRewrite(cuf::DataTransferOp op, mlir::PatternRewriter &rewriter) const override { @@ -445,9 +504,11 @@ struct CufDataTransferOpConversion mlir::Value sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(5)); - llvm::SmallVector args{fir::runtime::createArguments( - builder, loc, fTy, op.getDst(), op.getSrc(), bytes, modeValue, - sourceFile, sourceLine)}; + mlir::Value dst = getDeviceAddress(rewriter, op.getDstMutable(), symtab); + mlir::Value src = getDeviceAddress(rewriter, op.getSrcMutable(), symtab); + llvm::SmallVector args{ + fir::runtime::createArguments(builder, loc, fTy, dst, src, bytes, + modeValue, sourceFile, sourceLine)}; builder.create(loc, func, args); rewriter.eraseOp(op); return mlir::success(); @@ -552,6 +613,9 @@ struct CufDataTransferOpConversion } return mlir::success(); } + +private: + const mlir::SymbolTable &symtab; }; class CufOpConversion : public fir::impl::CufOpConversionBase { @@ -565,13 +629,15 @@ public: mlir::ModuleOp module = mlir::dyn_cast(op); if (!module) return signalPassFailure(); + mlir::SymbolTable symtab(module); std::optional dl = fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false); fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false, /*forceUnifiedTBAATree=*/false, *dl); target.addLegalDialect(); - cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, patterns); + cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab, + patterns); if (mlir::failed(mlir::applyPartialConversion(getOperation(), target, std::move(patterns)))) { mlir::emitError(mlir::UnknownLoc::get(ctx), @@ -584,9 +650,9 @@ public: void cuf::populateCUFToFIRConversionPatterns( const fir::LLVMTypeConverter &converter, mlir::DataLayout &dl, - mlir::RewritePatternSet &patterns) { + const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { patterns.insert(patterns.getContext(), &dl, &converter); patterns.insert( - patterns.getContext()); + CufFreeOpConversion>(patterns.getContext()); + patterns.insert(patterns.getContext(), symtab); } diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir index ed894aed5534..c33c50115b9f 100644 --- a/flang/test/Fir/CUDA/cuda-data-transfer.fir +++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir @@ -189,4 +189,47 @@ func.func @_QPsub7() { // CHECK: %[[SRC:.*]] = fir.convert %[[IHOST]]#0 : (!fir.ref>) -> !fir.llvm_ptr // CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +fir.global 
@_QMmtestsEn(dense<[3, 4, 5, 6, 7]> : tensor<5xi32>) {data_attr = #cuf.cuda} : !fir.array<5xi32> +func.func @_QPsub8() attributes {fir.bindc_name = "t"} { + %c5 = arith.constant 5 : index + %0 = fir.alloca !fir.array<5xi32> {bindc_name = "m", uniq_name = "_QFEm"} + %1 = fir.shape %c5 : (index) -> !fir.shape<1> + %2 = fir.declare %0(%1) {uniq_name = "_QFEm"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + %3 = fir.address_of(@_QMmtestsEn) : !fir.ref> + %4 = fir.declare %3(%1) {data_attr = #cuf.cuda, uniq_name = "_QMmtestsEn"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + cuf.data_transfer %4 to %2 {transfer_kind = #cuf.cuda_transfer} : !fir.ref>, !fir.ref> + return +} + +// CHECK-LABEL: func.func @_QPsub8() +// CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<5xi32> +// CHECK: %[[LOCAL:.*]] = fir.declare %[[ALLOCA]] +// CHECK: %[[GBL:.*]] = fir.address_of(@_QMmtestsEn) : !fir.ref> +// CHECK: %[[DECL:.*]] = fir.declare %[[GBL]] +// CHECK: %[[HOST:.*]] = fir.convert %[[DECL]] : (!fir.ref>) -> !fir.llvm_ptr +// CHECK: %[[SRC:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[HOST]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr +// CHECK: %[[DST:.*]] = fir.convert %[[LOCAL]] : (!fir.ref>) -> !fir.llvm_ptr +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none + + +func.func @_QPsub9() { + %c5 = arith.constant 5 : index + %0 = fir.alloca !fir.array<5xi32> {bindc_name = "m", uniq_name = "_QFtest9Em"} + %1 = fir.shape %c5 : (index) -> !fir.shape<1> + %2 = fir.declare %0(%1) {uniq_name = "_QFtest9Em"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + %3 = fir.address_of(@_QMmtestsEn) : !fir.ref> + %4 = fir.declare %3(%1) {data_attr = #cuf.cuda, uniq_name = "_QMmtestsEn"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + cuf.data_transfer %2 to %4 {transfer_kind = #cuf.cuda_transfer} : !fir.ref>, !fir.ref> + return +} + +// CHECK-LABEL: func.func @_QPsub9() +// CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<5xi32> +// CHECK: %[[LOCAL:.*]] = fir.declare %[[ALLOCA]] +// CHECK: %[[GBL:.*]] = fir.address_of(@_QMmtestsEn) : !fir.ref> +// CHECK: %[[DECL:.*]] = fir.declare %[[GBL]] +// CHECK: %[[HOST:.*]] = fir.convert %[[DECL]] : (!fir.ref>) -> !fir.llvm_ptr +// CHECK: %[[DST:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[HOST]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr +// CHECK: %[[SRC:.*]] = fir.convert %[[LOCAL]] : (!fir.ref>) -> !fir.llvm_ptr +// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[DST]], %[[SRC]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none } // end of module -- GitLab From 1bc1a79a65a93a0224b5e5f69584219f9981bd23 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 18 Oct 2024 18:17:23 -0700 Subject: [PATCH 137/511] [RISCV] Support inline assembly 'f' constraint for Zfinx. (#112986) This would allow some inline assembly code to work with either F or Zfinx. This appears to match gcc behavior. 
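For illustration (this example is not part of the patch), a snippet such as

    float scale(float a, float b) {
      float r;
      __asm__("fmul.s %0, %1, %2" : "=f"(r) : "f"(a), "f"(b));
      return r;
    }

now gets an FPR for each 'f' operand when compiled with the F/D extensions
and a GPR when compiled with Zfinx/Zdinx (flag spellings such as
-march=rv32i_zfinx are illustrative), matching the GCC behavior noted above.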
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 48 ++++++++--- .../RISCV/inline-asm-zdinx-constraint-r.ll | 48 +++++++++++ .../RISCV/inline-asm-zfinx-constraint-r.ll | 45 ++++++++++ .../RISCV/inline-asm-zhinx-constraint-r.ll | 82 +++++++++++++++++++ 4 files changed, 211 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index fbd2f47d2769..3588ef46cadc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20392,12 +20392,24 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass); return std::make_pair(0U, &RISCV::GPRNoX0RegClass); case 'f': - if (Subtarget.hasStdExtZfhmin() && VT == MVT::f16) - return std::make_pair(0U, &RISCV::FPR16RegClass); - if (Subtarget.hasStdExtF() && VT == MVT::f32) - return std::make_pair(0U, &RISCV::FPR32RegClass); - if (Subtarget.hasStdExtD() && VT == MVT::f64) - return std::make_pair(0U, &RISCV::FPR64RegClass); + if (VT == MVT::f16) { + if (Subtarget.hasStdExtZfhmin()) + return std::make_pair(0U, &RISCV::FPR16RegClass); + if (Subtarget.hasStdExtZhinxmin()) + return std::make_pair(0U, &RISCV::GPRF16NoX0RegClass); + } else if (VT == MVT::f32) { + if (Subtarget.hasStdExtF()) + return std::make_pair(0U, &RISCV::FPR32RegClass); + if (Subtarget.hasStdExtZfinx()) + return std::make_pair(0U, &RISCV::GPRF32NoX0RegClass); + } else if (VT == MVT::f64) { + if (Subtarget.hasStdExtD()) + return std::make_pair(0U, &RISCV::FPR64RegClass); + if (Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit()) + return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass); + if (Subtarget.hasStdExtZdinx() && Subtarget.is64Bit()) + return std::make_pair(0U, &RISCV::GPRNoX0RegClass); + } break; default: break; @@ -20440,12 +20452,24 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (!VT.isVector()) return std::make_pair(0U, &RISCV::GPRCRegClass); } else if (Constraint == "cf") { - if (Subtarget.hasStdExtZfhmin() && VT == MVT::f16) - return std::make_pair(0U, &RISCV::FPR16CRegClass); - if (Subtarget.hasStdExtF() && VT == MVT::f32) - return std::make_pair(0U, &RISCV::FPR32CRegClass); - if (Subtarget.hasStdExtD() && VT == MVT::f64) - return std::make_pair(0U, &RISCV::FPR64CRegClass); + if (VT == MVT::f16) { + if (Subtarget.hasStdExtZfhmin()) + return std::make_pair(0U, &RISCV::FPR16CRegClass); + if (Subtarget.hasStdExtZhinxmin()) + return std::make_pair(0U, &RISCV::GPRF16CRegClass); + } else if (VT == MVT::f32) { + if (Subtarget.hasStdExtF()) + return std::make_pair(0U, &RISCV::FPR32CRegClass); + if (Subtarget.hasStdExtZfinx()) + return std::make_pair(0U, &RISCV::GPRF32CRegClass); + } else if (VT == MVT::f64) { + if (Subtarget.hasStdExtD()) + return std::make_pair(0U, &RISCV::FPR64CRegClass); + if (Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit()) + return std::make_pair(0U, &RISCV::GPRPairCRegClass); + if (Subtarget.hasStdExtZdinx() && Subtarget.is64Bit()) + return std::make_pair(0U, &RISCV::GPRCRegClass); + } } // Clang will correctly decode the usage of register name aliases into their diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zdinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zdinx-constraint-r.ll index 15729ee2bc61..57be0e5e4199 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-zdinx-constraint-r.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-zdinx-constraint-r.ll @@ -90,3 +90,51 @@ define double 
@constraint_double_abi_name(double %a) nounwind { %2 = tail call double asm "fadd.d $0, $1, $2", "={t1},{a0},{s0}"(double %a, double %1) ret double %2 } + +define double @constraint_f_double(double %a) nounwind { +; RV32FINX-LABEL: constraint_f_double: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: lui a2, %hi(gd) +; RV32FINX-NEXT: lw a3, %lo(gd+4)(a2) +; RV32FINX-NEXT: lw a2, %lo(gd)(a2) +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.d a0, a0, a2 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_f_double: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: lui a1, %hi(gd) +; RV64FINX-NEXT: ld a1, %lo(gd)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.d a0, a0, a1 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: ret + %1 = load double, ptr @gd + %2 = tail call double asm "fadd.d $0, $1, $2", "=f,f,f"(double %a, double %1) + ret double %2 +} + +define double @constraint_cf_double(double %a) nounwind { +; RV32FINX-LABEL: constraint_cf_double: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: lui a2, %hi(gd) +; RV32FINX-NEXT: lw a3, %lo(gd+4)(a2) +; RV32FINX-NEXT: lw a2, %lo(gd)(a2) +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.d a0, a0, a2 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_cf_double: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: lui a1, %hi(gd) +; RV64FINX-NEXT: ld a1, %lo(gd)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.d a0, a0, a1 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: ret + %1 = load double, ptr @gd + %2 = tail call double asm "fadd.d $0, $1, $2", "=^cf,^cf,^cf"(double %a, double %1) + ret double %2 +} diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll index a8d3515fe189..1c0de6c3f161 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll @@ -87,3 +87,48 @@ define float @constraint_float_abi_name(float %a) nounwind { ret float %2 } +define float @constraint_f_float(float %a) nounwind { +; RV32FINX-LABEL: constraint_f_float: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: lui a1, %hi(gf) +; RV32FINX-NEXT: lw a1, %lo(gf)(a1) +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.s a0, a0, a1 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_f_float: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: lui a1, %hi(gf) +; RV64FINX-NEXT: lw a1, %lo(gf)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.s a0, a0, a1 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm "fadd.s $0, $1, $2", "=f,f,f"(float %a, float %1) + ret float %2 +} + +define float @constraint_cf_float(float %a) nounwind { +; RV32FINX-LABEL: constraint_cf_float: +; RV32FINX: # %bb.0: +; RV32FINX-NEXT: lui a1, %hi(gf) +; RV32FINX-NEXT: lw a1, %lo(gf)(a1) +; RV32FINX-NEXT: #APP +; RV32FINX-NEXT: fadd.s a0, a0, a1 +; RV32FINX-NEXT: #NO_APP +; RV32FINX-NEXT: ret +; +; RV64FINX-LABEL: constraint_cf_float: +; RV64FINX: # %bb.0: +; RV64FINX-NEXT: lui a1, %hi(gf) +; RV64FINX-NEXT: lw a1, %lo(gf)(a1) +; RV64FINX-NEXT: #APP +; RV64FINX-NEXT: fadd.s a0, a0, a1 +; RV64FINX-NEXT: #NO_APP +; RV64FINX-NEXT: ret + %1 = load float, ptr @gf + %2 = tail call float asm "fadd.s $0, $1, $2", "=^cf,cf,cf"(float %a, float %1) + ret float %2 +} diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll index f9707c6c8995..086d2a1d6f3b 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll +++ 
b/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll @@ -156,3 +156,85 @@ define half @constraint_half_abi_name(half %a) nounwind { %2 = tail call half asm "fadd.s $0, $1, $2", "={t0},{a0},{s0}"(half %a, half %1) ret half %2 } + +define half @constraint_f_half(half %a) nounwind { +; RV32ZHINX-LABEL: constraint_f_half: +; RV32ZHINX: # %bb.0: +; RV32ZHINX-NEXT: lui a1, %hi(gh) +; RV32ZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV32ZHINX-NEXT: #APP +; RV32ZHINX-NEXT: fadd.h a0, a0, a1 +; RV32ZHINX-NEXT: #NO_APP +; RV32ZHINX-NEXT: ret +; +; RV64ZHINX-LABEL: constraint_f_half: +; RV64ZHINX: # %bb.0: +; RV64ZHINX-NEXT: lui a1, %hi(gh) +; RV64ZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV64ZHINX-NEXT: #APP +; RV64ZHINX-NEXT: fadd.h a0, a0, a1 +; RV64ZHINX-NEXT: #NO_APP +; RV64ZHINX-NEXT: ret +; +; RV32DINXZHINX-LABEL: constraint_f_half: +; RV32DINXZHINX: # %bb.0: +; RV32DINXZHINX-NEXT: lui a1, %hi(gh) +; RV32DINXZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV32DINXZHINX-NEXT: #APP +; RV32DINXZHINX-NEXT: fadd.h a0, a0, a1 +; RV32DINXZHINX-NEXT: #NO_APP +; RV32DINXZHINX-NEXT: ret +; +; RV64DINXZHINX-LABEL: constraint_f_half: +; RV64DINXZHINX: # %bb.0: +; RV64DINXZHINX-NEXT: lui a1, %hi(gh) +; RV64DINXZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV64DINXZHINX-NEXT: #APP +; RV64DINXZHINX-NEXT: fadd.h a0, a0, a1 +; RV64DINXZHINX-NEXT: #NO_APP +; RV64DINXZHINX-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm "fadd.h $0, $1, $2", "=f,f,f"(half %a, half %1) + ret half %2 +} + +define half @constraint_cf_half(half %a) nounwind { +; RV32ZHINX-LABEL: constraint_cf_half: +; RV32ZHINX: # %bb.0: +; RV32ZHINX-NEXT: lui a1, %hi(gh) +; RV32ZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV32ZHINX-NEXT: #APP +; RV32ZHINX-NEXT: fadd.h a0, a0, a1 +; RV32ZHINX-NEXT: #NO_APP +; RV32ZHINX-NEXT: ret +; +; RV64ZHINX-LABEL: constraint_cf_half: +; RV64ZHINX: # %bb.0: +; RV64ZHINX-NEXT: lui a1, %hi(gh) +; RV64ZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV64ZHINX-NEXT: #APP +; RV64ZHINX-NEXT: fadd.h a0, a0, a1 +; RV64ZHINX-NEXT: #NO_APP +; RV64ZHINX-NEXT: ret +; +; RV32DINXZHINX-LABEL: constraint_cf_half: +; RV32DINXZHINX: # %bb.0: +; RV32DINXZHINX-NEXT: lui a1, %hi(gh) +; RV32DINXZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV32DINXZHINX-NEXT: #APP +; RV32DINXZHINX-NEXT: fadd.h a0, a0, a1 +; RV32DINXZHINX-NEXT: #NO_APP +; RV32DINXZHINX-NEXT: ret +; +; RV64DINXZHINX-LABEL: constraint_cf_half: +; RV64DINXZHINX: # %bb.0: +; RV64DINXZHINX-NEXT: lui a1, %hi(gh) +; RV64DINXZHINX-NEXT: lh a1, %lo(gh)(a1) +; RV64DINXZHINX-NEXT: #APP +; RV64DINXZHINX-NEXT: fadd.h a0, a0, a1 +; RV64DINXZHINX-NEXT: #NO_APP +; RV64DINXZHINX-NEXT: ret + %1 = load half, ptr @gh + %2 = tail call half asm "fadd.h $0, $1, $2", "=^cf,^cf,^cf"(half %a, half %1) + ret half %2 +} -- GitLab From 1784aca904718421452445a4d835af3cd3c3c89b Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 18 Oct 2024 18:34:37 -0700 Subject: [PATCH 138/511] gn build: Sync hwasan assembly file source list. 
--- .../secondary/compiler-rt/lib/hwasan/BUILD.gn | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn index a30c291e1567..e39d8114d1f4 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn @@ -57,14 +57,27 @@ source_set("sources") { "hwasan_poisoning.h", "hwasan_report.cpp", "hwasan_report.h", - "hwasan_setjmp_aarch64.S", - "hwasan_setjmp_riscv64.S", "hwasan_thread.cpp", "hwasan_thread.h", "hwasan_thread_list.cpp", "hwasan_thread_list.h", "hwasan_type_test.cpp", ] + if (current_cpu == "arm64") { + sources += [ + "hwasan_setjmp_aarch64.S", + "hwasan_tag_mismatch_aarch64.S", + ] + } + if (current_cpu == "riscv64") { + sources += [ + "hwasan_setjmp_riscv64.S", + "hwasan_tag_mismatch_riscv64.S", + ] + } + if (current_cpu == "x64") { + sources += [ "hwasan_setjmp_x86_64.S" ] + } } source_set("cxx_sources") { -- GitLab From 561f9155fb8beea15e1824ea966f934477f05fa6 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 19 Oct 2024 01:37:17 +0000 Subject: [PATCH 139/511] [gn build] Port b515d9ea1e43 --- .../utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn index e39d8114d1f4..f453dde0ea93 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn @@ -64,16 +64,10 @@ source_set("sources") { "hwasan_type_test.cpp", ] if (current_cpu == "arm64") { - sources += [ - "hwasan_setjmp_aarch64.S", - "hwasan_tag_mismatch_aarch64.S", - ] + sources += [ "hwasan_setjmp_aarch64.S" ] } if (current_cpu == "riscv64") { - sources += [ - "hwasan_setjmp_riscv64.S", - "hwasan_tag_mismatch_riscv64.S", - ] + sources += [ "hwasan_setjmp_riscv64.S" ] } if (current_cpu == "x64") { sources += [ "hwasan_setjmp_x86_64.S" ] -- GitLab From b5fa4fee46c1d0046cc395e3338ae13fe6e2cb84 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 18 Oct 2024 18:40:29 -0700 Subject: [PATCH 140/511] [lsan] Fix compilation on Android (#113003) --- compiler-rt/lib/lsan/lsan_common.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 25d79544b2f3..5c44c000ae57 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -521,13 +521,14 @@ static void ProcessThread(tid_t os_id, uptr sp, } } # if SANITIZER_ANDROID + extra_ranges.clear(); auto *cb = +[](void *dtls_begin, void *dtls_end, uptr /*dso_idd*/, void *arg) -> void { - ScanForPointers( - reinterpret_cast<uptr>(dtls_begin), reinterpret_cast<uptr>(dtls_end), - reinterpret_cast<Frontier *>(arg), "DTLS", kReachable, accessor); + reinterpret_cast<InternalMmapVector<Range> *>(arg)->push_back( + {reinterpret_cast<uptr>(dtls_begin), + reinterpret_cast<uptr>(dtls_end)}); }; - + ScanRanges(extra_ranges, frontier, "DTLS", accessor); // FIXME: There might be a race-condition here (and in Bionic) if the // thread is suspended in the middle of updating its DTLS. IOWs, we // could scan already freed memory.
(probably fine for now) -- GitLab From 2a6b09e0d3d3c1a05d3d5165202a6e68900974b1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Oct 2024 19:12:39 -0700 Subject: [PATCH 141/511] [LV] Use type from InsertPos for cost computation of interleave groups. Previously the legacy cost model would pick the type for the cost computation depending on the order of the members in the input IR. This is incompatible with the VPlan-based cost model (independent of original IR order) and also doesn't match code-gen, which uses the type of the insert position. Update the legacy cost model to use the type (and address space) from the Group's insert position. This brings the legacy cost model in line with the VPlan-based cost model and fixes a divergence between both models. Note that the X86 cost model seems to assign different costs to groups with i64 and double types. Added a TODO to check. Fixes https://github.com/llvm/llvm-project/issues/112922. --- .../Transforms/Vectorize/LoopVectorize.cpp | 16 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +- .../LoopVectorize/X86/interleave-cost.ll | 182 ++++++++++++++++++ 3 files changed, 205 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 857efbdf687c..ce0903b838aa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5738,14 +5738,15 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, ElementCount VF) { - Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); - unsigned AS = getLoadStoreAddressSpace(I); - enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - const auto *Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); + Instruction *InsertPos = Group->getInsertPos(); + Type *ValTy = getLoadStoreType(InsertPos); + auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); + unsigned AS = getLoadStoreAddressSpace(InsertPos); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + unsigned InterleaveFactor = Group->getFactor(); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); @@ -5760,8 +5761,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || (isa<LoadInst>(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices, + Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I), + UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f4a1f58debba..41f13cc2d9a9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2958,11 +2958,20 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - Instruction *I = getInsertPos(); + Instruction *InsertPos = getInsertPos(); + // Find the VPValue index of the interleave group. We need to skip gaps.
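+  // (E.g., in a load group {A, gap, B} the insert position B is member 2 of the group but VPValue 1, since the gap defines no VPValue.)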
+ unsigned InsertPosIdx = 0; + for (unsigned Idx = 0; IG->getFactor(); ++Idx) + if (auto *Member = IG->getMember(Idx)) { + if (Member == InsertPos) + break; + InsertPosIdx++; + } Type *ValTy = Ctx.Types.inferScalarType( - getNumDefinedValues() > 0 ? getVPValue(0) : getStoredValues()[0]); + getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx) + : getStoredValues()[InsertPosIdx]); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); - unsigned AS = getLoadStoreAddressSpace(I); + unsigned AS = getLoadStoreAddressSpace(InsertPos); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; unsigned InterleaveFactor = IG->getFactor(); @@ -2976,8 +2985,8 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, // Calculate the cost of the whole interleaved group. InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, IG->getFactor(), Indices, IG->getAlign(), AS, - CostKind, getMask(), NeedsMaskForGaps); + InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, + IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps); if (!IG->isReverse()) return Cost; diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 7d1d326641e1..ad0068dc3f6b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -586,6 +586,184 @@ exit: ret void } +; Test case for https://github.com/llvm/llvm-project/issues/112922. +define void @interleave_store_double_i64(ptr %dst) { +; CHECK-LABEL: define void @interleave_store_double_i64( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[VEC_IND]] to <2 x double> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[IV]], i32 1 +; CHECK-NEXT: store i64 [[IV]], ptr [[GEP_1]], align 8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: 
[[EC:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.1 = getelementptr { double, i64 }, ptr %dst, i64 %iv, i32 1 + store i64 %iv, ptr %gep.1, align 8 + %gep.0 = getelementptr { double, i64 }, ptr %dst, i64 %iv + store double 0.000000e+00, ptr %gep.0, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @interleave_store_i64_double(ptr %dst) { +; CHECK-LABEL: define void @interleave_store_i64_double( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[IV]], i32 1 +; CHECK-NEXT: store i64 [[IV]], ptr [[GEP_1]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.0 = getelementptr { double, i64 }, ptr %dst, i64 %iv + store double 0.000000e+00, ptr %gep.0, align 8 + %gep.1 = getelementptr { double, i64 }, ptr %dst, i64 %iv, i32 1 + store i64 %iv, ptr %gep.1, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; TODO: The interleave group should likely have the same cost as @interleave_store_double_i64. 
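+; (The group is costed via its insert position, which is the i64 store here; the X86 model appears to price i64 and double groups differently, hence no vectorization below.)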
+define void @interleave_store_double_i64_2(ptr %dst) { +; CHECK-LABEL: define void @interleave_store_double_i64_2( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[IV]], i32 1 +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i64 [[IV]], ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.1 = getelementptr { i64, double }, ptr %dst, i64 %iv, i32 1 + store double 0.000000e+00, ptr %gep.1, align 8 + %gep.0 = getelementptr { i64, double }, ptr %dst, i64 %iv + store i64 %iv, ptr %gep.0, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @interleave_store_i64_double_2(ptr %dst) { +; CHECK-LABEL: define void @interleave_store_i64_double_2( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[VEC_IND]] to <2 x double> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i64 [[IV]], ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[IV]], i32 1 +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.0 
= getelementptr { i64, double }, ptr %dst, i64 %iv + store i64 %iv, ptr %gep.0, align 8 + %gep.1 = getelementptr { i64, double }, ptr %dst, i64 %iv, i32 1 + store double 0.000000e+00, ptr %gep.1, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + + attributes #0 = { "target-features"="+sse4.2" } attributes #1 = { "min-legal-vector-width"="0" "target-cpu"="cascadelake" } @@ -601,4 +779,8 @@ attributes #1 = { "min-legal-vector-width"="0" "target-cpu"="cascadelake" } ; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} ; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]} ;. -- GitLab From 0a6def62c2807d213e2b80f23c4a14cb9302f3fd Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Sat, 19 Oct 2024 10:17:44 +0800 Subject: [PATCH 142/511] [mlir][doc] Emit `\n` if description does not end with `\n`. (#112898) This PR addresses a markdown formatting issue by ensuring a `\n` is emitted if the description string does not already end with one. Fixes #112672. --- mlir/tools/mlir-tblgen/OpDocGen.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index ff3c6b16bb6e..d499c78a5cf4 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -70,7 +70,10 @@ void mlir::tblgen::emitSummary(StringRef summary, raw_ostream &os) { // nested in the op definition.
void mlir::tblgen::emitDescription(StringRef description, raw_ostream &os) { raw_indented_ostream ros(os); - ros.printReindented(description.rtrim(" \t")); + StringRef trimmed = description.rtrim(" \t"); + ros.printReindented(trimmed); + if (!trimmed.ends_with("\n")) + ros << "\n"; } void mlir::tblgen::emitDescriptionComment(StringRef description, -- GitLab From cf4442e6b10280a90982a161b91319ebd1235718 Mon Sep 17 00:00:00 2001 From: Job Henandez Lara Date: Fri, 18 Oct 2024 19:41:22 -0700 Subject: [PATCH 143/511] [libc] temporarily disable __USE_EXTERN_INLINES and set __USE_FORTIFY_LEVEL to 0 before including <stdio.h> and <wchar.h> in overlay mode (#113012) --- libc/hdr/stdio_overlay.h | 22 ++++++++++++++++++++++ libc/hdr/wchar_overlay.h | 22 ++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/libc/hdr/stdio_overlay.h b/libc/hdr/stdio_overlay.h index cec55abfde7b..aef8c448fe49 100644 --- a/libc/hdr/stdio_overlay.h +++ b/libc/hdr/stdio_overlay.h @@ -27,6 +27,17 @@ #undef _FORTIFY_SOURCE #endif +#ifdef __USE_EXTERN_INLINES +#define LIBC_OLD_USE_EXTERN_INLINES +#undef __USE_EXTERN_INLINES +#endif + +#ifdef __USE_FORTIFY_LEVEL +#define LIBC_OLD_USE_FORTIFY_LEVEL __USE_FORTIFY_LEVEL +#undef __USE_FORTIFY_LEVEL +#define __USE_FORTIFY_LEVEL 0 +#endif + #ifndef __NO_INLINE__ #define __NO_INLINE__ 1 #define LIBC_SET_NO_INLINE #endif @@ -44,4 +55,15 @@ #undef LIBC_SET_NO_INLINE #endif +#ifdef LIBC_OLD_USE_FORTIFY_LEVEL +#undef __USE_FORTIFY_LEVEL +#define __USE_FORTIFY_LEVEL LIBC_OLD_USE_FORTIFY_LEVEL +#undef LIBC_OLD_USE_FORTIFY_LEVEL +#endif + +#ifdef LIBC_OLD_USE_EXTERN_INLINES +#define __USE_EXTERN_INLINES +#undef LIBC_OLD_USE_EXTERN_INLINES +#endif + #endif // LLVM_LIBC_HDR_STDIO_OVERLAY_H diff --git a/libc/hdr/wchar_overlay.h b/libc/hdr/wchar_overlay.h index a1de9d5085d4..99a70899779e 100644 --- a/libc/hdr/wchar_overlay.h +++ b/libc/hdr/wchar_overlay.h @@ -32,6 +32,17 @@ #define LIBC_SET_NO_INLINE #endif +#ifdef __USE_EXTERN_INLINES +#define LIBC_OLD_USE_EXTERN_INLINES +#undef __USE_EXTERN_INLINES +#endif + +#ifdef __USE_FORTIFY_LEVEL +#define LIBC_OLD_USE_FORTIFY_LEVEL __USE_FORTIFY_LEVEL +#undef __USE_FORTIFY_LEVEL +#define __USE_FORTIFY_LEVEL 0 +#endif + #include <wchar.h> #ifdef LIBC_OLD_FORTIFY_SOURCE @@ -44,4 +55,15 @@ #undef LIBC_SET_NO_INLINE #endif +#ifdef LIBC_OLD_USE_FORTIFY_LEVEL +#undef __USE_FORTIFY_LEVEL +#define __USE_FORTIFY_LEVEL LIBC_OLD_USE_FORTIFY_LEVEL +#undef LIBC_OLD_USE_FORTIFY_LEVEL +#endif + +#ifdef LIBC_OLD_USE_EXTERN_INLINES +#define __USE_EXTERN_INLINES +#undef LIBC_OLD_USE_EXTERN_INLINES +#endif + #endif // LLVM_LIBC_HDR_WCHAR_OVERLAY_H -- GitLab From 85df28180bd38d3fd5356efe6022eebec31e0814 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 18 Oct 2024 21:10:00 -0700 Subject: [PATCH 144/511] [clang-format] Fix a bug that always returns error for JSON (#112839) Fixes #108556.
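As a hedged sketch of the behavior being fixed (hypothetical shell session): clang-format parses JSON by prepending a synthetic `x = ` replacement so the input can be treated as JavaScript, so for JSON files `Replaces` was never empty and `-n` (dry run) always reported a warning, even on already-formatted input; the fix tolerates exactly that one synthetic replacement.

    $ printf '{\n  "married": true\n}\n' > t.json
    $ clang-format -n -style=LLVM t.json    # previously warned; now silent with exit code 0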
--- clang/test/Format/dry-run-warning.cpp | 22 ++++++++++++++++++++++ clang/tools/clang-format/ClangFormat.cpp | 15 ++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 clang/test/Format/dry-run-warning.cpp diff --git a/clang/test/Format/dry-run-warning.cpp b/clang/test/Format/dry-run-warning.cpp new file mode 100644 index 000000000000..4b85de40b8cd --- /dev/null +++ b/clang/test/Format/dry-run-warning.cpp @@ -0,0 +1,22 @@ +// RUN: echo '{' > %t.json +// RUN: echo ' "married": true' >> %t.json +// RUN: echo '}' >> %t.json + +// RUN: clang-format -n -style=LLVM %t.json 2>&1 | FileCheck %s -allow-empty + +// RUN: clang-format -n -style=LLVM < %t.json 2>&1 \ +// RUN: | FileCheck %s -check-prefix=CHECK2 -strict-whitespace + +// RUN: echo '{' > %t.json +// RUN: echo ' "married" : true' >> %t.json +// RUN: echo '}' >> %t.json + +// RUN: clang-format -n -style=LLVM < %t.json 2>&1 | FileCheck %s -allow-empty + +// RUN: clang-format -n -style=LLVM %t.json 2>&1 \ +// RUN: | FileCheck %s -check-prefix=CHECK2 -strict-whitespace + +// RUN: rm %t.json + +// CHECK-NOT: warning +// CHECK2: warning: code should be clang-formatted diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index 6aed46328f34..108db7204aa6 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -351,9 +351,6 @@ static void outputReplacementsXML(const Replacements &Replaces) { static bool emitReplacementWarnings(const Replacements &Replaces, StringRef AssumedFileName, const std::unique_ptr<llvm::MemoryBuffer> &Code) { - if (Replaces.empty()) - return false; - unsigned Errors = 0; if (WarnFormat && !NoWarnFormat) { SourceMgr Mgr; @@ -490,9 +487,11 @@ static bool format(StringRef FileName, bool ErrorOnIncompleteFormat = false) { Replacements Replaces = sortIncludes(*FormatStyle, Code->getBuffer(), Ranges, AssumedFileName, &CursorPosition); + const bool IsJson = FormatStyle->isJson(); + // To format JSON insert a variable to trick the code into thinking its // JavaScript. - if (FormatStyle->isJson() && !FormatStyle->DisableFormat) { + if (IsJson && !FormatStyle->DisableFormat) { auto Err = Replaces.add(tooling::Replacement( tooling::Replacement(AssumedFileName, 0, 0, "x = "))); if (Err) @@ -510,9 +509,11 @@ static bool format(StringRef FileName, bool ErrorOnIncompleteFormat = false) { Replacements FormatChanges = reformat(*FormatStyle, *ChangedCode, Ranges, AssumedFileName, &Status); Replaces = Replaces.merge(FormatChanges); - if (OutputXML || DryRun) { - if (DryRun) - return emitReplacementWarnings(Replaces, AssumedFileName, Code); + if (DryRun) { + return Replaces.size() > (IsJson ? 
1 : 0) && + emitReplacementWarnings(Replaces, AssumedFileName, Code); + } + if (OutputXML) { outputXML(Replaces, FormatChanges, Status, Cursor, CursorPosition); } else { IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> InMemoryFileSystem( -- GitLab From 5406834cdaa6d26b98484d634df579606ae02229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 18 Oct 2024 21:30:38 -0700 Subject: [PATCH 145/511] [flang][cuda] Add cuf.register_module operation (#112971) Add a new operation to register the fatbin and pass it to `cuf.register_kernel` --- .../flang/Optimizer/Dialect/CUF/CUFOps.h | 1 + .../flang/Optimizer/Dialect/CUF/CUFOps.td | 20 +++++++++++++++++-- .../Transforms/CUFAddConstructor.cpp | 5 ++++- flang/test/Fir/CUDA/cuda-register-func.fir | 5 +++-- flang/test/Fir/cuf-invalid.fir | 15 +++++++++----- 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h index 4132db672e39..1edded090f8c 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h @@ -12,6 +12,7 @@ #include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h" #include "flang/Optimizer/Dialect/CUF/CUFDialect.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/OpDefinition.h" #define GET_OP_CLASSES diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index 98d1ef529738..d34a8af0394a 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -18,6 +18,7 @@ include "flang/Optimizer/Dialect/CUF/CUFDialect.td" include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td" include "flang/Optimizer/Dialect/FIRTypes.td" include "flang/Optimizer/Dialect/FIRAttr.td" +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/IR/BuiltinAttributes.td" @@ -288,15 +289,30 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments, let hasVerifier = 1; } +def cuf_RegisterModuleOp : cuf_Op<"register_module", []> { + let summary = "Register a CUDA module"; + + let arguments = (ins + SymbolRefAttr:$name + ); + + let assemblyFormat = [{ + $name attr-dict `->` type($modulePtr) + }]; + + let results = (outs LLVM_AnyPointer:$modulePtr); +} + def cuf_RegisterKernelOp : cuf_Op<"register_kernel", []> { let summary = "Register a CUDA kernel"; let arguments = (ins - SymbolRefAttr:$name + SymbolRefAttr:$name, + LLVM_AnyPointer:$modulePtr ); let assemblyFormat = [{ - $name attr-dict + $name `(` $modulePtr `:` type($modulePtr) `)`attr-dict }]; let hasVerifier = 1; diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp index 3db24226e750..f260437e7104 100644 --- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp +++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp @@ -62,12 +62,15 @@ struct CUFAddConstructor // Register kernels auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(cudaModName); if (gpuMod) { + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx); + auto registeredMod = builder.create<cuf::RegisterModuleOp>( + loc, llvmPtrTy, mlir::SymbolRefAttr::get(ctx, gpuMod.getName())); for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) { if (func.isKernel()) { auto kernelName = mlir::SymbolRefAttr::get(
builder.getStringAttr(cudaModName), {mlir::SymbolRefAttr::get(builder.getContext(), func.getName())}); - builder.create<cuf::RegisterKernelOp>(loc, kernelName); + builder.create<cuf::RegisterKernelOp>(loc, kernelName, registeredMod); } } } diff --git a/flang/test/Fir/CUDA/cuda-register-func.fir b/flang/test/Fir/CUDA/cuda-register-func.fir index 277475f0883d..6b0cbfd3aca6 100644 --- a/flang/test/Fir/CUDA/cuda-register-func.fir +++ b/flang/test/Fir/CUDA/cuda-register-func.fir @@ -12,5 +12,6 @@ module attributes {gpu.container_module} { } // CHECK-LABEL: llvm.func internal @__cudaFortranConstructor() -// CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device1 -// CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device2 +// CHECK: %[[MOD_HANDLE:.*]] = cuf.register_module @cuda_device_mod -> !llvm.ptr +// CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device1(%[[MOD_HANDLE]] : !llvm.ptr) +// CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device2(%[[MOD_HANDLE]] : !llvm.ptr) diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir index 8a1eb4857683..a3b9be3ee822 100644 --- a/flang/test/Fir/cuf-invalid.fir +++ b/flang/test/Fir/cuf-invalid.fir @@ -135,8 +135,9 @@ module attributes {gpu.container_module} { } } llvm.func internal @__cudaFortranConstructor() { + %0 = cuf.register_module @cuda_device_mod -> !llvm.ptr // expected-error@+1{{'cuf.register_kernel' op only kernel gpu.func can be registered}} - cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + cuf.register_kernel @cuda_device_mod::@_QPsub_device1(%0 : !llvm.ptr) llvm.return } } @@ -150,8 +151,9 @@ module attributes {gpu.container_module} { } } llvm.func internal @__cudaFortranConstructor() { + %0 = cuf.register_module @cuda_device_mod -> !llvm.ptr // expected-error@+1{{'cuf.register_kernel' op device function not found}} - cuf.register_kernel @cuda_device_mod::@_QPsub_device2 + cuf.register_kernel @cuda_device_mod::@_QPsub_device2(%0 : !llvm.ptr) llvm.return } } @@ -160,8 +162,9 @@ module attributes {gpu.container_module} { module attributes {gpu.container_module} { llvm.func internal @__cudaFortranConstructor() { + %0 = cuf.register_module @cuda_device_mod -> !llvm.ptr // expected-error@+1{{'cuf.register_kernel' op gpu module not found}} - cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + cuf.register_kernel @cuda_device_mod::@_QPsub_device1(%0 : !llvm.ptr) llvm.return } } @@ -170,8 +173,9 @@ module attributes {gpu.container_module} { module attributes {gpu.container_module} { llvm.func internal @__cudaFortranConstructor() { + %0 = cuf.register_module @cuda_device_mod -> !llvm.ptr // expected-error@+1{{'cuf.register_kernel' op expect a module and a kernel name}} - cuf.register_kernel @_QPsub_device1 + cuf.register_kernel @_QPsub_device1(%0 : !llvm.ptr) llvm.return } } @@ -185,8 +189,9 @@ module attributes {gpu.container_module} { } } llvm.func internal @__cudaFortranConstructor() { + %0 = cuf.register_module @cuda_device_mod -> !llvm.ptr // expected-error@+1{{'cuf.register_kernel' op only gpu.kernel llvm.func can be registered}} - cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + cuf.register_kernel @cuda_device_mod::@_QPsub_device1(%0 : !llvm.ptr) llvm.return } } -- GitLab From d37bc32a65651e647148236ffb9728ea2e77eac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 18 Oct 2024 21:31:47 -0700 Subject: [PATCH 146/511] [flang][cuda] Translate cuf.register_kernel and
cuf.register_module (#112972) Add LLVM IR translation for `cuf.register_module` and `cuf.register_kernel`. These are lowered to function calls to the CUF runtime entry points. --- .../Dialect/CUF/CUFToLLVMIRTranslation.h | 29 +++++ .../include/flang/Optimizer/Support/InitFIR.h | 2 + .../include/flang/Runtime/CUDA/registration.h | 28 +++++ .../lib/Optimizer/Dialect/CUF/CMakeLists.txt | 1 + .../Dialect/CUF/CUFToLLVMIRTranslation.cpp | 104 ++++++++++++++++++ .../Optimizer/Transforms/CufOpConversion.cpp | 1 + flang/runtime/CUDA/CMakeLists.txt | 1 + flang/runtime/CUDA/registration.cpp | 31 ++++++ 8 files changed, 197 insertions(+) create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.h create mode 100644 flang/include/flang/Runtime/CUDA/registration.h create mode 100644 flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp create mode 100644 flang/runtime/CUDA/registration.cpp diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.h b/flang/include/flang/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.h new file mode 100644 index 000000000000..f3edb7fca649 --- /dev/null +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.h @@ -0,0 +1,29 @@ +//===- CUFToLLVMIRTranslation.h - CUF Dialect to LLVM IR --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides registration calls for CUF dialect to LLVM IR translation. +// +//===----------------------------------------------------------------------===// + +#ifndef FLANG_OPTIMIZER_DIALECT_CUF_GPUTOLLVMIRTRANSLATION_H_ +#define FLANG_OPTIMIZER_DIALECT_CUF_GPUTOLLVMIRTRANSLATION_H_ + +namespace mlir { +class DialectRegistry; +class MLIRContext; +} // namespace mlir + +namespace cuf { + +/// Register the CUF dialect and the translation from it to the LLVM IR in +/// the given registry. +void registerCUFDialectTranslation(mlir::DialectRegistry &registry); + +} // namespace cuf + +#endif // FLANG_OPTIMIZER_DIALECT_CUF_GPUTOLLVMIRTRANSLATION_H_ diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h index 04a5dd323e55..1c61c3671999 100644 --- a/flang/include/flang/Optimizer/Support/InitFIR.h +++ b/flang/include/flang/Optimizer/Support/InitFIR.h @@ -14,6 +14,7 @@ #define FORTRAN_OPTIMIZER_SUPPORT_INITFIR_H #include "flang/Optimizer/Dialect/CUF/CUFDialect.h" +#include "flang/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/HLFIR/HLFIRDialect.h" #include "mlir/Conversion/Passes.h" @@ -61,6 +62,7 @@ inline void addFIRExtensions(mlir::DialectRegistry &registry, if (addFIRInlinerInterface) addFIRInlinerExtension(registry); addFIRToLLVMIRExtension(registry); + cuf::registerCUFDialectTranslation(registry); } inline void loadNonCodegenDialects(mlir::MLIRContext &context) { diff --git a/flang/include/flang/Runtime/CUDA/registration.h b/flang/include/flang/Runtime/CUDA/registration.h new file mode 100644 index 000000000000..cbe202c4d23e --- /dev/null +++ b/flang/include/flang/Runtime/CUDA/registration.h @@ -0,0 +1,28 @@ +//===-- include/flang/Runtime/CUDA/registration.h ---------------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_RUNTIME_CUDA_REGISTRATION_H_ +#define FORTRAN_RUNTIME_CUDA_REGISTRATION_H_ + +#include "flang/Runtime/entry-names.h" +#include <cstddef> + +namespace Fortran::runtime::cuda { + +extern "C" { + +/// Register a CUDA module. +void *RTDECL(CUFRegisterModule)(void *data); + +/// Register a device function. +void RTDECL(CUFRegisterFunction)(void **module, const char *fct); + +} // extern "C" + +} // namespace Fortran::runtime::cuda +#endif // FORTRAN_RUNTIME_CUDA_REGISTRATION_H_ diff --git a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt index b2221199995d..5d4bd0785971 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(Attributes) add_flang_library(CUFDialect CUFDialect.cpp CUFOps.cpp + CUFToLLVMIRTranslation.cpp DEPENDS MLIRIR diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp new file mode 100644 index 000000000000..c6c9f96b8113 --- /dev/null +++ b/flang/lib/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.cpp @@ -0,0 +1,104 @@ +//===- CUFToLLVMIRTranslation.cpp - Translate CUF dialect to LLVM IR ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a translation between the MLIR CUF dialect and LLVM IR.
+// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Dialect/CUF/CUFToLLVMIRTranslation.h" +#include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Runtime/entry-names.h" +#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" +#include "mlir/Target/LLVMIR/ModuleTranslation.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/FormatVariadic.h" + +using namespace mlir; + +namespace { + +LogicalResult registerModule(cuf::RegisterModuleOp op, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + std::string binaryIdentifier = + op.getName().getLeafReference().str() + "_bin_cst"; + llvm::Module *module = moduleTranslation.getLLVMModule(); + llvm::Value *binary = module->getGlobalVariable(binaryIdentifier, true); + if (!binary) + return op.emitError() << "Couldn't find the binary: " << binaryIdentifier; + + llvm::Type *ptrTy = builder.getPtrTy(0); + llvm::FunctionCallee fct = module->getOrInsertFunction( + RTNAME_STRING(CUFRegisterModule), + llvm::FunctionType::get(ptrTy, ArrayRef<llvm::Type *>({ptrTy}), false)); + auto *handle = builder.CreateCall(fct, {binary}); + moduleTranslation.mapValue(op->getResults().front()) = handle; + return mlir::success(); +} + +llvm::Value *getOrCreateFunctionName(llvm::Module *module, + llvm::IRBuilderBase &builder, + llvm::StringRef moduleName, + llvm::StringRef kernelName) { + std::string globalName = + std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, kernelName)); + + if (llvm::GlobalVariable *gv = module->getGlobalVariable(globalName)) + return gv; + + return builder.CreateGlobalString(kernelName, globalName); +} + +LogicalResult registerKernel(cuf::RegisterKernelOp op, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::Module *module = moduleTranslation.getLLVMModule(); + llvm::Type *ptrTy = builder.getPtrTy(0); + llvm::FunctionCallee fct = module->getOrInsertFunction( + RTNAME_STRING(CUFRegisterFunction), + llvm::FunctionType::get(ptrTy, ArrayRef<llvm::Type *>({ptrTy, ptrTy}), + false)); + llvm::Value *modulePtr = moduleTranslation.lookupValue(op.getModulePtr()); + builder.CreateCall( + fct, {modulePtr, getOrCreateFunctionName(module, builder, + op.getKernelModuleName().str(), + op.getKernelName().str())}); + return mlir::success(); +} + +class CUFDialectLLVMIRTranslationInterface + : public LLVMTranslationDialectInterface { +public: + using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface; + + LogicalResult + convertOperation(Operation *operation, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) const override { + return llvm::TypeSwitch<Operation *, LogicalResult>(operation) + .Case([&](cuf::RegisterModuleOp op) { + return registerModule(op, builder, moduleTranslation); + }) + .Case([&](cuf::RegisterKernelOp op) { + return registerKernel(op, builder, moduleTranslation); + }) + .Default([&](Operation *op) { + return op->emitError("unsupported GPU operation: ") << op->getName(); + }); + } +}; + +} // namespace + +void cuf::registerCUFDialectTranslation(DialectRegistry &registry) { + registry.insert<cuf::CUFDialect>(); + registry.addExtension(+[](MLIRContext *ctx, cuf::CUFDialect *dialect) { + dialect->addInterfaces<CUFDialectLLVMIRTranslationInterface>(); + }); +} diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp index 9df559ee0ab1..629f0c69f8cb 100644 --- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp +++ 
b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp @@ -20,6 +20,7 @@ #include "flang/Runtime/CUDA/descriptor.h" #include "flang/Runtime/CUDA/memory.h" #include "flang/Runtime/allocatable.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt index 193dd77e9345..86523b419f87 100644 --- a/flang/runtime/CUDA/CMakeLists.txt +++ b/flang/runtime/CUDA/CMakeLists.txt @@ -18,6 +18,7 @@ add_flang_library(${CUFRT_LIBNAME} allocatable.cpp descriptor.cpp memory.cpp + registration.cpp ) if (BUILD_SHARED_LIBS) diff --git a/flang/runtime/CUDA/registration.cpp b/flang/runtime/CUDA/registration.cpp new file mode 100644 index 000000000000..aed275e96468 --- /dev/null +++ b/flang/runtime/CUDA/registration.cpp @@ -0,0 +1,31 @@ +//===-- runtime/CUDA/registration.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Runtime/CUDA/registration.h" + +#include "cuda_runtime.h" + +namespace Fortran::runtime::cuda { + +extern "C" { + +extern void **__cudaRegisterFatBinary(void *data); +extern void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid, + uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize); + +void *RTDECL(CUFRegisterModule)(void *data) { + return __cudaRegisterFatBinary(data); +} + +void RTDEF(CUFRegisterFunction)(void **module, const char *fct) { + __cudaRegisterFunction(module, fct, const_cast(fct), fct, -1, + (uint3 *)0, (uint3 *)0, (dim3 *)0, (dim3 *)0, (int *)0); +} +} +} // namespace Fortran::runtime::cuda -- GitLab From 3d84b74cb3543428c35fc39e889684497286d482 Mon Sep 17 00:00:00 2001 From: Augusto Noronha Date: Fri, 18 Oct 2024 22:57:45 -0700 Subject: [PATCH 147/511] [lldb] Add GetMangledTypeName to TypeSystem/CompilerType (#113006) Swift types have mangled names, so there should be a way to read those from the compiler type. This patch upstreams these two changes from swiftlang/llvm-project (which were added there since at least 2016). --- lldb/include/lldb/Symbol/CompilerType.h | 2 ++ lldb/include/lldb/Symbol/TypeSystem.h | 4 ++++ lldb/source/Symbol/CompilerType.cpp | 8 ++++++++ lldb/source/Symbol/TypeSystem.cpp | 4 ++++ 4 files changed, 18 insertions(+) diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h index 70dacdcb7986..096a8f1ab68e 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -279,6 +279,8 @@ public: ConstString GetDisplayTypeName() const; + ConstString GetMangledTypeName() const; + uint32_t GetTypeInfo(CompilerType *pointee_or_element_compiler_type = nullptr) const; diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 7d48f9b31613..416445a60bd0 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -237,6 +237,10 @@ public: virtual ConstString GetDisplayTypeName(lldb::opaque_compiler_type_t type) = 0; + // Defaults to GetTypeName(type). Override if your language desires + // specialized behavior. 
+ virtual ConstString GetMangledTypeName(lldb::opaque_compiler_type_t type); + virtual uint32_t GetTypeInfo(lldb::opaque_compiler_type_t type, CompilerType *pointee_or_element_compiler_type) = 0; diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index f8da9ef7b764..e9e6e3bf2600 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -540,6 +540,14 @@ ConstString CompilerType::GetDisplayTypeName() const { return ConstString(""); } +ConstString CompilerType::GetMangledTypeName() const { + if (IsValid()) { + if (auto type_system_sp = GetTypeSystem()) + return type_system_sp->GetMangledTypeName(m_type); + } + return ConstString(""); +} + uint32_t CompilerType::GetTypeInfo( CompilerType *pointee_or_element_compiler_type) const { if (IsValid()) diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp index 931ce1b0203a..f7d634ffa2de 100644 --- a/lldb/source/Symbol/TypeSystem.cpp +++ b/lldb/source/Symbol/TypeSystem.cpp @@ -157,6 +157,10 @@ bool TypeSystem::IsMeaninglessWithoutDynamicResolution(void *type) { return false; } +ConstString TypeSystem::GetMangledTypeName(void *type) { + return GetTypeName(type, false); +} + ConstString TypeSystem::DeclGetMangledName(void *opaque_decl) { return ConstString(); } -- GitLab From d8b17f2fb6129dba99c2ef843e5c38cc4414ae67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Sat, 19 Oct 2024 09:41:43 +0200 Subject: [PATCH 148/511] [GlobalISel] Combine G_UNMERGE_VALUES with anyext and build vector (#112370) Combine G_UNMERGE_VALUES (G_ANYEXT (G_BUILD_VECTOR)) into build vectors of anyexts. A search with `ag G_UNMERGE_VALUES llvm/test/CodeGen/AArch64/GlobalISel | grep ANYEXT` finds the cases where the unmerged [ANYEXT] source is a build vector or shuffle vector. Prior art: https://reviews.llvm.org/D87117 https://reviews.llvm.org/D87166 https://reviews.llvm.org/D87174 https://reviews.llvm.org/D87427 ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR2]](<8 x s8>) ; CHECK-NEXT: [[UV10:%[0-9]+]]:_(<4 x s16>), [[UV11:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>) Test: llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 4 + .../CodeGen/GlobalISel/GenericMachineInstrs.h | 8 + .../include/llvm/Target/GlobalISel/Combine.td | 31 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 82 ++++ llvm/lib/Target/AArch64/AArch64Combine.td | 4 +- .../combine-shift-immed-mismatch-crash.mir | 39 +- .../GlobalISel/combine-shifts-undef.mir | 15 +- .../AArch64/GlobalISel/combine-unmerge.mir | 117 ++++- .../legalize-shuffle-vector-widen-crash.ll | 22 +- llvm/test/CodeGen/AArch64/add.ll | 46 +- llvm/test/CodeGen/AArch64/andorxor.ll | 138 +++--- .../AArch64/arm64-extract-insert-varidx.ll | 34 +- llvm/test/CodeGen/AArch64/bitcast.ll | 41 +- llvm/test/CodeGen/AArch64/concat-vector.ll | 7 +- llvm/test/CodeGen/AArch64/fptoi.ll | 10 +- .../test/CodeGen/AArch64/fptosi-sat-scalar.ll | 51 +- .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 438 ++++++++---------- .../test/CodeGen/AArch64/fptoui-sat-scalar.ll | 49 +- .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 363 +++++++-------- llvm/test/CodeGen/AArch64/load.ll | 7 +- llvm/test/CodeGen/AArch64/mul.ll | 46 +- .../AArch64/neon-bitwise-instructions.ll | 36 +- .../AArch64/neon-compare-instructions.ll | 23 +- llvm/test/CodeGen/AArch64/sext.ll | 90 ++--
llvm/test/CodeGen/AArch64/sub.ll | 46 +- llvm/test/CodeGen/AArch64/xtn.ll | 5 +- llvm/test/CodeGen/AArch64/zext.ll | 80 ++-- 27 files changed, 948 insertions(+), 884 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 76d51ab819f4..9240a3c3127e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -918,6 +918,10 @@ public: bool matchCanonicalizeICmp(const MachineInstr &MI, BuildFnTy &MatchInfo); bool matchCanonicalizeFCmp(const MachineInstr &MI, BuildFnTy &MatchInfo); + // unmerge_values(anyext(build vector)) -> build vector(anyext) + bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI, + BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index d9f3f4ab3935..92d37753791c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -868,6 +868,14 @@ public: }; }; +/// Represents an any ext. +class GAnyExt : public GCastOp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_ANYEXT; + }; +}; + /// Represents a trunc. class GTrunc : public GCastOp { public: diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index d0373a7dadfc..ead4149fc110 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -420,7 +420,7 @@ def unary_undef_to_zero: GICombineRule< // replaced with undef. def propagate_undef_any_op: GICombineRule< (defs root:$root), - (match (wip_match_opcode G_ADD, G_FPTOSI, G_FPTOUI, G_SUB, G_XOR, G_TRUNC, G_BITCAST):$root, + (match (wip_match_opcode G_ADD, G_FPTOSI, G_FPTOUI, G_SUB, G_XOR, G_TRUNC, G_BITCAST, G_ANYEXT):$root, [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; @@ -428,7 +428,7 @@ def propagate_undef_any_op: GICombineRule< // replaced with undef. def propagate_undef_all_ops: GICombineRule< (defs root:$root), - (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + (match (wip_match_opcode G_SHUFFLE_VECTOR, G_BUILD_VECTOR):$root, [{ return Helper.matchAllExplicitUsesAreUndef(*${root}); }]), (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; @@ -832,6 +832,14 @@ def unmerge_dead_to_trunc : GICombineRule< (apply [{ Helper.applyCombineUnmergeWithDeadLanesToTrunc(*${d}); }]) >; +// Transform unmerge any build vector -> build vector anyext +def unmerge_anyext_build_vector : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_UNMERGE_VALUES): $root, + [{ return Helper.matchUnmergeValuesAnyExtBuildVector(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }]) +>; + // Transform x,y = unmerge(zext(z)) -> x = zext z; y = 0. 
def unmerge_zext_to_zext : GICombineRule< (defs root:$d), (match (wip_match_opcode G_UNMERGE_VALUES):$d, [{ return Helper.matchCombineUnmergeZExtToZExt(*${d}); }]), (apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }]) >; +def merge_combines: GICombineGroup<[ + unmerge_anyext_build_vector, + unmerge_merge, + merge_unmerge, + unmerge_cst, + unmerge_undef, + unmerge_dead_to_trunc, + unmerge_zext_to_zext +]>; + // Under certain conditions, transform: // trunc (shl x, K) -> shl (trunc x), K// // trunc ([al]shr x, K) -> (trunc ([al]shr (trunc x), K)) @@ -1851,7 +1869,6 @@ def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, propagate_undef_all_ops, propagate_undef_shuffle_mask, erase_undef_store, - unmerge_undef, insert_extract_vec_elt_out_of_bounds]>; def identity_combines : GICombineGroup<[select_same_val, right_identity_zero, @@ -1909,10 +1926,8 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, reassocs, ptr_add_immed_chain, cmp_combines, shl_ashr_to_sext_inreg, sext_inreg_of_load, width_reduction_combines, select_combines, - known_bits_simplifications, + known_bits_simplifications, trunc_shift, not_cmp_fold, opt_brcond_by_inverting_cond, - unmerge_merge, unmerge_cst, unmerge_dead_to_trunc, - unmerge_zext_to_zext, merge_unmerge, trunc_shift, const_combines, xor_of_and_with_same_reg, ptr_add_with_zero, shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine, div_rem_to_divrem, funnel_shift_combines, bitreverse_shift, commute_shift, @@ -1920,11 +1935,11 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, constant_fold_cast_op, fabs_fneg_fold, intdiv_combines, mulh_combines, redundant_neg_operands, and_or_disjoint_mask, fma_combines, fold_binop_into_select, - sub_add_reg, select_to_minmax, + sub_add_reg, select_to_minmax, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, combine_concat_vector, match_addos, sext_trunc, zext_trunc, prefer_sign_combines, combine_shuffle_concat, - combine_use_vector_truncate]>; + combine_use_vector_truncate, merge_combines]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index f9b1621955c2..b7ddf9f479ef 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -7611,3 +7611,85 @@ bool CombinerHelper::matchFoldAMinusC1PlusC2(const MachineInstr &MI, return true; } + +bool CombinerHelper::matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI, + BuildFnTy &MatchInfo) { + const GUnmerge *Unmerge = cast<GUnmerge>(&MI); + + if (!MRI.hasOneNonDBGUse(Unmerge->getSourceReg())) + return false; + + const MachineInstr *Source = MRI.getVRegDef(Unmerge->getSourceReg()); + + LLT DstTy = MRI.getType(Unmerge->getReg(0)); + + // $bv:_(<8 x s8>) = G_BUILD_VECTOR .... + // $any:_(<8 x s16>) = G_ANYEXT $bv + // $uv:_(<4 x s16>), $uv1:_(<4 x s16>) = G_UNMERGE_VALUES $any + // + // -> + // + // $any:_(s16) = G_ANYEXT $bv[0] + // $any1:_(s16) = G_ANYEXT $bv[1] + // $any2:_(s16) = G_ANYEXT $bv[2] + // $any3:_(s16) = G_ANYEXT $bv[3] + // $any4:_(s16) = G_ANYEXT $bv[4] + // $any5:_(s16) = G_ANYEXT $bv[5] + // $any6:_(s16) = G_ANYEXT $bv[6] + // $any7:_(s16) = G_ANYEXT $bv[7] + // $uv:_(<4 x s16>) = G_BUILD_VECTOR $any, $any1, $any2, $any3 + // $uv1:_(<4 x s16>) = G_BUILD_VECTOR $any4, $any5, $any6, $any7 + + // We want to unmerge into vectors.
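+  // (Scalar and scalable-vector destinations are rejected below: the rewrite emits one fixed-length G_BUILD_VECTOR per unmerge result.)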
+  if (!DstTy.isFixedVector())
+    return false;
+
+  const GAnyExt *Any = dyn_cast<GAnyExt>(Source);
+  if (!Any)
+    return false;
+
+  const MachineInstr *NextSource = MRI.getVRegDef(Any->getSrcReg());
+
+  if (const GBuildVector *BV = dyn_cast<GBuildVector>(NextSource)) {
+    // G_UNMERGE_VALUES G_ANYEXT G_BUILD_VECTOR
+
+    if (!MRI.hasOneNonDBGUse(BV->getReg(0)))
+      return false;
+
+    // FIXME: check element types?
+    if (BV->getNumSources() % Unmerge->getNumDefs() != 0)
+      return false;
+
+    LLT BigBvTy = MRI.getType(BV->getReg(0));
+    LLT SmallBvTy = DstTy;
+    LLT SmallBvElementTy = SmallBvTy.getElementType();
+
+    if (!isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_BUILD_VECTOR, {SmallBvTy, SmallBvElementTy}}))
+      return false;
+
+    // We check the legality of scalar anyext.
+    if (!isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_ANYEXT,
+             {SmallBvElementTy, BigBvTy.getElementType()}}))
+      return false;
+
+    MatchInfo = [=](MachineIRBuilder &B) {
+      // Build into each G_UNMERGE_VALUES def
+      // a small build vector with anyext from the source build vector.
+      for (unsigned I = 0; I < Unmerge->getNumDefs(); ++I) {
+        SmallVector<Register, 8> Ops;
+        for (unsigned J = 0; J < SmallBvTy.getNumElements(); ++J) {
+          Register SourceArray =
+              BV->getSourceReg(I * SmallBvTy.getNumElements() + J);
+          auto AnyExt = B.buildAnyExt(SmallBvElementTy, SourceArray);
+          Ops.push_back(AnyExt.getReg(0));
+        }
+        B.buildBuildVector(Unmerge->getOperand(I).getReg(), Ops);
+      }
+    };
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 321190c83b79..8af8cdfeba6a 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -322,13 +322,13 @@ def AArch64PostLegalizerCombiner
                        extractvecelt_pairwise_add, redundant_or,
                        mul_const, redundant_sext_inreg,
                        form_bitfield_extract, rotate_out_of_range,
-                       icmp_to_true_false_known_bits, merge_unmerge,
+                       icmp_to_true_false_known_bits,
                        select_combines, fold_merge_to_zext,
                        constant_fold_binops, identity_combines,
                        ptr_add_immed_chain, overlapping_and,
                        split_store_zero_128, undef_combines,
                        select_to_minmax, or_to_bsp, combine_concat_vector,
-                       commute_constant_to_rhs,
+                       commute_constant_to_rhs, merge_combines,
                        push_freeze_to_prevent_poison_from_propagating,
                        combine_mul_cmlt, combine_use_vector_truncate]> {
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir
index 96a6f18b1d41..16a8f8089784 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir
@@ -9,24 +9,27 @@ liveins:
 body: |
   ; CHECK-LABEL: name: shift_immed_chain_mismatch_size_crash
   ; CHECK: bb.0:
-  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; CHECK: liveins: $x0
-  ; CHECK: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
-  ; CHECK: [[DEF1:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
-  ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
-  ; CHECK: G_BRCOND [[DEF]](s1), %bb.2
-  ; CHECK: G_BR %bb.1
-  ; CHECK: bb.1:
-  ; CHECK: successors:
-  ; CHECK: bb.2:
-  ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p0) :: (load (s32) from `ptr undef`, align 8)
-  ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-  ; CHECK: [[SHL:%[0-9]+]]:_(s32) = nsw G_SHL [[LOAD]], [[C1]](s32)
-  ; CHECK: [[MUL:%[0-9]+]]:_(s32) = nsw G_MUL [[SHL]], [[C]]
-  ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-  ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[MUL]], [[C2]](s64)
-  ;
CHECK: $w0 = COPY [[SHL1]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9 + ; CHECK-NEXT: G_BRCOND [[DEF]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p0) :: (load (s32) from `ptr undef`, align 8) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = nsw G_SHL [[LOAD]], [[C1]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = nsw G_MUL [[SHL]], [[C]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[MUL]], [[C2]](s64) + ; CHECK-NEXT: $w0 = COPY [[SHL1]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 bb.1: liveins: $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shifts-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shifts-undef.mir index d4dc24741527..236d49fc99c6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shifts-undef.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shifts-undef.mir @@ -13,9 +13,8 @@ body: | ; CHECK-LABEL: name: shl_by_ge_bw ; CHECK: liveins: $w0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[DEF]](s16) - ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %1:_(s32) = COPY $w0 %0:_(s16) = G_TRUNC %1(s32) @@ -39,9 +38,8 @@ body: | ; CHECK-LABEL: name: lshr_by_ge_bw ; CHECK: liveins: $w0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[DEF]](s16) - ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %1:_(s32) = COPY $w0 %0:_(s16) = G_TRUNC %1(s32) @@ -65,9 +63,8 @@ body: | ; CHECK-LABEL: name: ashr_by_ge_bw ; CHECK: liveins: $w0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[DEF]](s16) - ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %1:_(s32) = COPY $w0 %0:_(s16) = G_TRUNC %1(s32) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir index c2c6e04d2d0c..7566d38e6c6c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir @@ -54,9 +54,8 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_build_vector ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) - ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32) @@ -74,11 +73,9 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_unmerge_buildvector_3ops ; 
CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) - ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32) - ; CHECK-NEXT: $w2 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) + ; CHECK-NEXT: $w2 = COPY [[DEF]](s32) %0:_(s32) = G_IMPLICIT_DEF %1:_(s32) = G_IMPLICIT_DEF %5:_(s32) = G_IMPLICIT_DEF @@ -434,3 +431,111 @@ body: | $w0 = COPY %1(s32) $w1 = COPY %2(s32) ... + +# Check that we unmerge the build vector on the anyext +--- +name: test_anyext_buildvector +body: | + bb.1: + ; CHECK-LABEL: name: test_anyext_buildvector + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY1]](s32) + ; CHECK-NEXT: %un1:_(<2 x s64>) = G_BUILD_VECTOR [[ANYEXT]](s64), [[ANYEXT1]](s64) + ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY2]](s32) + ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY3]](s32) + ; CHECK-NEXT: %un2:_(<2 x s64>) = G_BUILD_VECTOR [[ANYEXT2]](s64), [[ANYEXT3]](s64) + ; CHECK-NEXT: $q0 = COPY %un1(<2 x s64>) + ; CHECK-NEXT: $q1 = COPY %un2(<2 x s64>) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w0 + %2:_(s32) = COPY $w0 + %3:_(s32) = COPY $w0 + %bv:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %any:_(<4 x s64>) = G_ANYEXT %bv(<4 x s32>) + %un1:_(<2 x s64>), %un2:_(<2 x s64>) = G_UNMERGE_VALUES %any(<4 x s64>) + $q0 = COPY %un1(<2 x s64>) + $q1 = COPY %un2(<2 x s64>) +... + +# Check that we unmerge the build vector on the anyext and undef +--- +name: test_anyext_buildvector_undef +body: | + bb.1: + ; CHECK-LABEL: name: test_anyext_buildvector_undef + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY1]](s32) + ; CHECK-NEXT: %un1:_(<2 x s64>) = G_BUILD_VECTOR [[ANYEXT]](s64), [[ANYEXT1]](s64) + ; CHECK-NEXT: %un2:_(<2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: $q0 = COPY %un1(<2 x s64>) + ; CHECK-NEXT: $q1 = COPY %un2(<2 x s64>) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w0 + %2:_(s32) = G_IMPLICIT_DEF + %3:_(s32) = G_IMPLICIT_DEF + %bv:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %any:_(<4 x s64>) = G_ANYEXT %bv(<4 x s32>) + %un1:_(<2 x s64>), %un2:_(<2 x s64>) = G_UNMERGE_VALUES %any(<4 x s64>) + $q0 = COPY %un1(<2 x s64>) + $q1 = COPY %un2(<2 x s64>) +... 
+ +# Check that we don't unmerge the build vector on the anyext, multi-use +--- +name: test_anyext_buildvector_multi +body: | + bb.1: + ; CHECK-LABEL: name: test_anyext_buildvector_multi + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: %bv:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[DEF]](s32), [[DEF1]](s32) + ; CHECK-NEXT: %any:_(<4 x s64>) = G_ANYEXT %bv(<4 x s32>) + ; CHECK-NEXT: %un1:_(<2 x s64>), %un2:_(<2 x s64>) = G_UNMERGE_VALUES %any(<4 x s64>) + ; CHECK-NEXT: $q0 = COPY %un1(<2 x s64>) + ; CHECK-NEXT: $q1 = COPY %un2(<2 x s64>) + ; CHECK-NEXT: $q2 = COPY %bv(<4 x s32>) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w0 + %2:_(s32) = G_IMPLICIT_DEF + %3:_(s32) = G_IMPLICIT_DEF + %bv:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %any:_(<4 x s64>) = G_ANYEXT %bv(<4 x s32>) + %un1:_(<2 x s64>), %un2:_(<2 x s64>) = G_UNMERGE_VALUES %any(<4 x s64>) + $q0 = COPY %un1(<2 x s64>) + $q1 = COPY %un2(<2 x s64>) + $q2 = COPY %bv(<4 x s32>) +... + +# Check that we don't unmerge the build vector on the anyext into scalar +--- +name: test_anyext_buildvector_scalar +body: | + bb.1: + ; CHECK-LABEL: name: test_anyext_buildvector_scalar + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %bv:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: %any:_(<4 x s64>) = G_ANYEXT %bv(<4 x s32>) + ; CHECK-NEXT: %un1:_(s128), %un2:_(s128) = G_UNMERGE_VALUES %any(<4 x s64>) + ; CHECK-NEXT: $q0 = COPY %un1(s128) + ; CHECK-NEXT: $q1 = COPY %un2(s128) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w0 + %2:_(s32) = COPY $w0 + %3:_(s32) = COPY $w0 + %bv:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %any:_(<4 x s64>) = G_ANYEXT %bv(<4 x s32>) + %un1:_(s128), %un2:_(s128) = G_UNMERGE_VALUES %any(<4 x s64>) + $q0 = COPY %un1(s128) + $q1 = COPY %un2(s128) +... 
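
The CHECK lines in the combine-unmerge.mir hunks above follow the
utils/update_mir_test_checks.py autogenerated style, so when the combine
changes they are normally regenerated rather than edited by hand. A minimal
sketch of that workflow, assuming a local CMake build directory named build
(the build path is an assumption, not part of this patch):

  # Refresh the autogenerated CHECK lines, pointing --llc-binary at the
  # locally built llc.
  llvm/utils/update_mir_test_checks.py --llc-binary build/bin/llc \
      llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir

  # Re-run just this test with lit to confirm the combine fires as expected.
  build/bin/llvm-lit llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
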
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll index 87c1307ad295..be80886ed3ef 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll @@ -9,18 +9,16 @@ define i32 @bar() { ; CHECK-LABEL: bar: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: mov b1, v0[1] -; CHECK-NEXT: mov b2, v0[3] -; CHECK-NEXT: mov b3, v0[2] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov.h v0[1], w8 -; CHECK-NEXT: mov.h v3[1], w9 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: ushll.4s v1, v3, #0 -; CHECK-NEXT: mov.d v0[1], v1[0] -; CHECK-NEXT: movi.4s v1, #1 -; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: umov.b w8, v0[0] +; CHECK-NEXT: umov.b w9, v0[1] +; CHECK-NEXT: mov.s v1[0], w8 +; CHECK-NEXT: umov.b w8, v0[2] +; CHECK-NEXT: mov.s v1[1], w9 +; CHECK-NEXT: umov.b w9, v0[3] +; CHECK-NEXT: movi.4s v0, #1 +; CHECK-NEXT: mov.s v1[2], w8 +; CHECK-NEXT: mov.s v1[3], w9 +; CHECK-NEXT: and.16b v0, v1, v0 ; CHECK-NEXT: addv.4s s0, v0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index ce7e3101a7a5..e3072dc41d93 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -155,21 +155,23 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[1] -; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] -; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[2] -; CHECK-GI-NEXT: mov b1, v1.b[3] -; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] -; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov b3, v1.b[3] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: mov v1.h[3], w9 ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 @@ -238,14 +240,12 @@ define void @v2i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; 
CHECK-GI-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 459daece90de..5c7429aebb31 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -447,21 +447,23 @@ define void @and_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[1] -; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] -; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[2] -; CHECK-GI-NEXT: mov b1, v1.b[3] -; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] -; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov b3, v1.b[3] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: mov v1.h[3], w9 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 @@ -494,21 +496,23 @@ define void @or_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[1] -; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] -; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[2] -; CHECK-GI-NEXT: mov b1, v1.b[3] -; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] -; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov b3, v1.b[3] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: mov v1.h[3], w9 ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 @@ -541,21 +545,23 @@ define void @xor_v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[1] -; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] -; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[2] -; CHECK-GI-NEXT: mov b1, v1.b[3] -; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] -; CHECK-GI-NEXT: 
mov v5.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] -; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov b3, v1.b[3] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: mov v1.h[3], w9 ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 @@ -698,14 +704,12 @@ define void @and_v2i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: and_v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -737,14 +741,12 @@ define void @or_v2i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: or_v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] @@ -776,14 +778,12 @@ define void @xor_v2i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: xor_v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll index 8611532d6ea9..7a4cdd52db90 100644 --- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll +++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll @@ -29,21 +29,23 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) { ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 ; 
CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GISEL-NEXT: mov b1, v0.b[1] ; CHECK-GISEL-NEXT: add x8, sp, #8 -; CHECK-GISEL-NEXT: str d0, [sp, #8] ; CHECK-GISEL-NEXT: and x9, x9, #0x7 -; CHECK-GISEL-NEXT: mov b2, v0.b[1] -; CHECK-GISEL-NEXT: mov b3, v0.b[2] +; CHECK-GISEL-NEXT: str d0, [sp, #8] +; CHECK-GISEL-NEXT: mov b2, v0.b[2] ; CHECK-GISEL-NEXT: lsl x10, x9, #1 ; CHECK-GISEL-NEXT: mov b0, v0.b[3] ; CHECK-GISEL-NEXT: sub x9, x10, x9 -; CHECK-GISEL-NEXT: ldr b1, [x8, x9] -; CHECK-GISEL-NEXT: mov v1.b[0], v1.b[0] -; CHECK-GISEL-NEXT: mov v1.b[1], v2.b[0] -; CHECK-GISEL-NEXT: mov v1.b[2], v3.b[0] -; CHECK-GISEL-NEXT: mov v1.b[3], v0.b[0] -; CHECK-GISEL-NEXT: ushll v0.8h, v1.8b, #0 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: ldrb w8, [x8, x9] +; CHECK-GISEL-NEXT: fmov w9, s1 +; CHECK-GISEL-NEXT: fmov s1, w8 +; CHECK-GISEL-NEXT: fmov w8, s2 +; CHECK-GISEL-NEXT: mov v1.h[1], w9 +; CHECK-GISEL-NEXT: mov v1.h[2], w8 +; CHECK-GISEL-NEXT: fmov w8, s0 +; CHECK-GISEL-NEXT: mov v1.h[3], w8 +; CHECK-GISEL-NEXT: fmov d0, d1 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret %tmp = extractelement <8 x i8> %x, i32 %idx @@ -179,13 +181,15 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) { ; CHECK-GISEL-NEXT: sub sp, sp, #16 ; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GISEL-NEXT: mov w9, w0 +; CHECK-GISEL-NEXT: mov w8, #2 // =0x2 +; CHECK-GISEL-NEXT: add x10, sp, #8 +; CHECK-GISEL-NEXT: and x9, x9, #0x3 ; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GISEL-NEXT: add x8, sp, #8 ; CHECK-GISEL-NEXT: str d0, [sp, #8] -; CHECK-GISEL-NEXT: and x9, x9, #0x3 -; CHECK-GISEL-NEXT: ldr h1, [x8, x9, lsl #1] -; CHECK-GISEL-NEXT: mov v1.h[1], v0.h[1] -; CHECK-GISEL-NEXT: ushll v0.4s, v1.4h, #0 +; CHECK-GISEL-NEXT: madd x8, x9, x8, x10 +; CHECK-GISEL-NEXT: umov w9, v0.h[1] +; CHECK-GISEL-NEXT: ld1 { v0.h }[0], [x8] +; CHECK-GISEL-NEXT: mov v0.s[1], w9 ; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GISEL-NEXT: add sp, sp, #16 ; CHECK-GISEL-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index bbdf8b0a13d3..39f2572d9fd3 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -81,13 +81,14 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){ ; CHECK-GI-NEXT: add w8, w0, w1 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b3, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov b1, v0.b[3] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.h[3], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = add i32 %a, %b @@ -134,8 +135,9 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){ ; CHECK-GI-NEXT: add w8, w0, w1 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = add i32 %a, %b @@ -414,13 +416,14 @@ define 
<4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ ; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b3, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov b1, v0.b[3] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.h[3], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = add <2 x i16> %a, %b @@ -449,8 +452,10 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %c = add <4 x i8> %a, %b diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index 0033999b9bd5..41b336bc3e8c 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -14,11 +14,10 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) { ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: mov v0.h[1], w8 ; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: mov v0.b[3], w9 -; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v0.h[3], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index c436c410a4e3..9c4f0207b84c 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -7616,10 +7616,9 @@ define <2 x i16> @fptos_v2f128_v2i16(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], w0 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #32 ; CHECK-GI-NEXT: ret @@ -7660,10 +7659,9 @@ define <2 x i16> @fptou_v2f128_v2i16(<2 x fp128> %a) { ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: fmov s0, w19 +; CHECK-GI-NEXT: mov v0.s[0], w19 ; CHECK-GI-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.h[1], w0 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #32 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll 
b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll index 9c52b024d3e2..17c87a5dae41 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll @@ -977,51 +977,46 @@ define i32 @test_signed_f128_i32(fp128 %f) { ; ; CHECK-GI-LABEL: test_signed_f128_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sub sp, sp, #64 -; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 ; CHECK-GI-NEXT: .cfi_offset w19, -8 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: .cfi_offset b8, -24 -; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 ; CHECK-GI-NEXT: adrp x8, .LCPI30_1 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_1] -; CHECK-GI-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: mov d1, v3.d[1] -; CHECK-GI-NEXT: fcsel d8, d2, d3, lt -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: fcsel d9, d0, d1, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d9 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov x9, #-4603241769126068224 // =0xc01e000000000000 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, x9, lt ; CHECK-GI-NEXT: adrp x8, .LCPI30_0 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_0] -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v1.d[1] -; CHECK-GI-NEXT: fcsel d1, d8, d1, gt -; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: fcsel d2, d9, d0, gt +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov x8, #281474976448512 // =0xfffffffc0000 +; CHECK-GI-NEXT: movk x8, #16413, lsl #48 +; CHECK-GI-NEXT: csel x8, x20, x8, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-NEXT: csel w0, wzr, w19, ne -; CHECK-GI-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #48 ; CHECK-GI-NEXT: ret %x = call i32 @llvm.fptosi.sat.i32.f128(fp128 %f) ret i32 %x diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 29a9082173ea..9ef6d61c350e 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -525,53 +525,48 
@@ define <1 x i32> @test_signed_v1f128_v1i32(<1 x fp128> %f) { ; ; CHECK-GI-LABEL: test_signed_v1f128_v1i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sub sp, sp, #64 -; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 ; CHECK-GI-NEXT: .cfi_offset w19, -8 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: .cfi_offset b8, -24 -; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 ; CHECK-GI-NEXT: adrp x8, .LCPI14_1 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_1] -; CHECK-GI-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: mov d1, v3.d[1] -; CHECK-GI-NEXT: fcsel d8, d2, d3, lt -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: fcsel d9, d0, d1, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d9 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov x9, #-4603241769126068224 // =0xc01e000000000000 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, x9, lt ; CHECK-GI-NEXT: adrp x8, .LCPI14_0 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v1.d[1] -; CHECK-GI-NEXT: fcsel d1, d8, d1, gt -; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: fcsel d2, d9, d0, gt +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov x8, #281474976448512 // =0xfffffffc0000 +; CHECK-GI-NEXT: movk x8, #16413, lsl #48 +; CHECK-GI-NEXT: csel x8, x20, x8, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-NEXT: csel w8, wzr, w19, ne -; CHECK-GI-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[0], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: add sp, sp, #48 ; CHECK-GI-NEXT: ret %x = call <1 x i32> @llvm.fptosi.sat.v1f128.v1i32(<1 x fp128> %f) ret <1 x i32> %x @@ -645,92 +640,82 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) { ; ; CHECK-GI-LABEL: test_signed_v2f128_v2i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sub sp, sp, #128 -; CHECK-GI-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #96] // 
8-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 128 +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 -; CHECK-GI-NEXT: .cfi_offset w30, -32 -; CHECK-GI-NEXT: .cfi_offset b8, -40 -; CHECK-GI-NEXT: .cfi_offset b9, -48 -; CHECK-GI-NEXT: .cfi_offset b10, -56 -; CHECK-GI-NEXT: .cfi_offset b11, -64 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 ; CHECK-GI-NEXT: adrp x8, .LCPI15_1 -; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] -; CHECK-GI-NEXT: stp q2, q1, [sp, #32] // 32-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: str q2, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: mov d8, v1.d[1] -; CHECK-GI-NEXT: fcsel d9, d2, d1, lt -; CHECK-GI-NEXT: fmov x8, d9 -; CHECK-GI-NEXT: fcsel d10, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov x20, #-4603241769126068224 // =0xc01e000000000000 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x21, x8, x20, lt ; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: mov v0.d[1], x21 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] -; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d11, v0.d[1] -; CHECK-GI-NEXT: fcsel d0, d9, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fcsel d1, d10, d11, gt +; CHECK-GI-NEXT: mov x22, #281474976448512 // =0xfffffffc0000 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt +; CHECK-GI-NEXT: movk x22, #16413, lsl #48 ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x21, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 -; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: csel w20, wzr, w19, ne +; CHECK-GI-NEXT: csel w21, wzr, w19, ne ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v1.d[1] -; CHECK-GI-NEXT: fcsel d9, d1, d2, lt -; CHECK-GI-NEXT: 
ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov x8, d9 -; CHECK-GI-NEXT: fcsel d8, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, x20, lt +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d8, d11, gt -; CHECK-GI-NEXT: fcsel d0, d9, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x20, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 -; CHECK-GI-NEXT: mov v0.s[0], w20 +; CHECK-GI-NEXT: mov v0.s[0], w21 ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-GI-NEXT: csel w8, wzr, w19, ne -; CHECK-GI-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: add sp, sp, #128 +; CHECK-GI-NEXT: add sp, sp, #112 ; CHECK-GI-NEXT: ret %x = call <2 x i32> @llvm.fptosi.sat.v2f128.v2i32(<2 x fp128> %f) ret <2 x i32> %x @@ -825,124 +810,107 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) { ; ; CHECK-GI-LABEL: test_signed_v3f128_v3i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sub sp, sp, #144 -; CHECK-GI-NEXT: stp d11, d10, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x30, x21, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 144 +; CHECK-GI-NEXT: sub sp, sp, #128 +; CHECK-GI-NEXT: stp x30, x23, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 128 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 -; CHECK-GI-NEXT: .cfi_offset w30, -32 -; CHECK-GI-NEXT: .cfi_offset b8, -40 -; CHECK-GI-NEXT: .cfi_offset b9, -48 -; CHECK-GI-NEXT: .cfi_offset b10, -56 -; CHECK-GI-NEXT: .cfi_offset b11, -64 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w30, -48 ; CHECK-GI-NEXT: adrp x8, .LCPI16_1 -; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI16_1] -; CHECK-GI-NEXT: str q2, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: str q1, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q2, [sp, #64] // 16-byte Folded Spill 
+; CHECK-GI-NEXT: str q1, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: mov d8, v1.d[1] -; CHECK-GI-NEXT: fcsel d10, d2, d1, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov x20, #-4603241769126068224 // =0xc01e000000000000 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x21, x8, x20, lt ; CHECK-GI-NEXT: adrp x8, .LCPI16_0 +; CHECK-GI-NEXT: mov v0.d[1], x21 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d9, v0.d[1] -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt +; CHECK-GI-NEXT: mov x22, #281474976448512 // =0xfffffffc0000 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt +; CHECK-GI-NEXT: movk x22, #16413, lsl #48 ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x21, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: csel w20, wzr, w19, ne +; CHECK-GI-NEXT: csel w21, wzr, w19, ne ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v1.d[1] -; CHECK-GI-NEXT: fcsel d10, d1, d2, lt -; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 -; CHECK-GI-NEXT: bl __gttf2 ; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x23, x8, x20, lt +; CHECK-GI-NEXT: mov v0.d[1], x23 +; CHECK-GI-NEXT: bl __gttf2 +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x23, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 -; CHECK-GI-NEXT: ldr q0, [sp, #32] // 
16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: csel w21, wzr, w19, ne +; CHECK-GI-NEXT: csel w23, wzr, w19, ne ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q3, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v3.d[1] -; CHECK-GI-NEXT: fcsel d10, d3, d2, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d8, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, x20, lt +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d8, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x20, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 -; CHECK-GI-NEXT: mov v0.s[0], w20 +; CHECK-GI-NEXT: mov v0.s[0], w21 ; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: csel w8, wzr, w19, ne -; CHECK-GI-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d9, d8, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[1], w21 -; CHECK-GI-NEXT: ldp x30, x21, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.s[1], w23 +; CHECK-GI-NEXT: ldp x30, x23, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: add sp, sp, #144 +; CHECK-GI-NEXT: add sp, sp, #128 ; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptosi.sat.v3f128.v3i32(<3 x fp128> %f) ret <3 x i32> %x @@ -1057,52 +1025,44 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) { ; ; CHECK-GI-LABEL: test_signed_v4f128_v4i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sub sp, sp, #176 -; CHECK-GI-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #128] // 8-byte Folded Spill -; CHECK-GI-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 176 +; CHECK-GI-NEXT: sub sp, sp, #160 +; CHECK-GI-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 160 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; 
CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 ; CHECK-GI-NEXT: .cfi_offset w22, -32 -; CHECK-GI-NEXT: .cfi_offset w30, -48 -; CHECK-GI-NEXT: .cfi_offset b8, -56 -; CHECK-GI-NEXT: .cfi_offset b9, -64 -; CHECK-GI-NEXT: .cfi_offset b10, -72 -; CHECK-GI-NEXT: .cfi_offset b11, -80 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w30, -64 ; CHECK-GI-NEXT: adrp x8, .LCPI17_1 ; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI17_1] ; CHECK-GI-NEXT: str q2, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: str q3, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: str q1, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q1, q3, [sp, #64] // 32-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: mov d8, v1.d[1] -; CHECK-GI-NEXT: fcsel d10, d2, d1, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov x20, #-4603241769126068224 // =0xc01e000000000000 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x21, x8, x20, lt ; CHECK-GI-NEXT: adrp x8, .LCPI17_0 +; CHECK-GI-NEXT: mov v0.d[1], x21 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] -; CHECK-GI-NEXT: str q1, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d9, v0.d[1] -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt +; CHECK-GI-NEXT: mov x22, #281474976448512 // =0xfffffffc0000 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt +; CHECK-GI-NEXT: movk x22, #16413, lsl #48 ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x21, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -1110,28 +1070,24 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) { ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: csel w20, wzr, w19, ne +; CHECK-GI-NEXT: csel w21, wzr, w19, ne ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q1, q4, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: fcsel d10, d2, d4, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov 
v0.d[0], x19 +; CHECK-GI-NEXT: csel x23, x8, x20, lt +; CHECK-GI-NEXT: mov v0.d[1], x23 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x23, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload @@ -1139,76 +1095,64 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) { ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: csel w21, wzr, w19, ne +; CHECK-GI-NEXT: csel w23, wzr, w19, ne ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v1.d[1] -; CHECK-GI-NEXT: fcsel d10, d1, d2, lt -; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x24, x8, x20, lt +; CHECK-GI-NEXT: mov v0.d[1], x24 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x24, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 -; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q0, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: csel w22, wzr, w19, ne +; CHECK-GI-NEXT: csel w24, wzr, w19, ne ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q5, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v5.d[1] -; CHECK-GI-NEXT: fcsel d10, d5, d2, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d8, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, x20, lt +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; 
CHECK-GI-NEXT: fcsel d1, d8, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x20, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixtfsi -; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: mov v1.16b, v0.16b ; CHECK-GI-NEXT: bl __unordtf2 -; CHECK-GI-NEXT: mov v0.s[0], w20 +; CHECK-GI-NEXT: mov v0.s[0], w21 ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload ; CHECK-GI-NEXT: csel w8, wzr, w19, ne -; CHECK-GI-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v0.s[1], w21 -; CHECK-GI-NEXT: mov v0.s[2], w22 -; CHECK-GI-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.s[1], w23 +; CHECK-GI-NEXT: mov v0.s[2], w24 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[3], w8 -; CHECK-GI-NEXT: add sp, sp, #176 +; CHECK-GI-NEXT: add sp, sp, #160 ; CHECK-GI-NEXT: ret %x = call <4 x i32> @llvm.fptosi.sat.v4f128.v4i32(<4 x fp128> %f) ret <4 x i32> %x diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll index 60f961fa8f94..3c19fca4a22a 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll @@ -787,43 +787,38 @@ define i32 @test_unsigned_f128_i32(fp128 %f) { ; ; CHECK-GI-LABEL: test_unsigned_f128_i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sub sp, sp, #64 -; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: .cfi_offset b8, -24 -; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 ; CHECK-GI-NEXT: adrp x8, .LCPI30_1 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_1] -; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v3.d[1] -; CHECK-GI-NEXT: mov d1, v2.d[1] -; CHECK-GI-NEXT: fcsel d8, d3, d2, lt -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: fcsel d9, d0, d1, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d9 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt ; CHECK-GI-NEXT: adrp x8, .LCPI30_0 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldr q1, [x8, 
:lo12:.LCPI30_0] -; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v1.d[1] -; CHECK-GI-NEXT: fcsel d1, d8, d1, gt -; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: fcsel d2, d9, d0, gt -; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov x8, #281474976579584 // =0xfffffffe0000 +; CHECK-GI-NEXT: movk x8, #16414, lsl #48 +; CHECK-GI-NEXT: csel x8, x20, x8, gt +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.d[1], x8 -; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: add sp, sp, #48 ; CHECK-GI-NEXT: b __fixunstfsi %x = call i32 @llvm.fptoui.sat.i32.f128(fp128 %f) ret i32 %x diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index 046ec0d07902..e1670ad2dc05 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -481,46 +481,41 @@ define <1 x i32> @test_unsigned_v1f128_v1i32(<1 x fp128> %f) { ; ; CHECK-GI-LABEL: test_unsigned_v1f128_v1i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sub sp, sp, #64 -; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: .cfi_offset b8, -24 -; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 ; CHECK-GI-NEXT: adrp x8, .LCPI14_1 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_1] -; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q3, q2, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v3.d[1] -; CHECK-GI-NEXT: mov d1, v2.d[1] -; CHECK-GI-NEXT: fcsel d8, d3, d2, lt -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: fcsel d9, d0, d1, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d9 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt ; CHECK-GI-NEXT: adrp x8, .LCPI14_0 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] -; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v1.d[1] -; CHECK-GI-NEXT: fcsel d1, d8, d1, gt -; CHECK-GI-NEXT: fmov x8, d1 -; CHECK-GI-NEXT: fcsel d2, d9, d0, gt +; CHECK-GI-NEXT: csel x8, x19, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov x8, #281474976579584 // =0xfffffffe0000 +; CHECK-GI-NEXT: movk x8, #16414, lsl #48 +; 
CHECK-GI-NEXT: csel x8, x20, x8, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[0], w0 -; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: add sp, sp, #48 ; CHECK-GI-NEXT: ret %x = call <1 x i32> @llvm.fptoui.sat.v1f128.v1i32(<1 x fp128> %f) ret <1 x i32> %x @@ -579,75 +574,64 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) { ; CHECK-GI-LABEL: test_unsigned_v2f128_v2i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: sub sp, sp, #96 -; CHECK-GI-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 ; CHECK-GI-NEXT: .cfi_offset w19, -8 -; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: .cfi_offset b8, -24 -; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: .cfi_offset b10, -40 -; CHECK-GI-NEXT: .cfi_offset b11, -48 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 ; CHECK-GI-NEXT: adrp x8, .LCPI15_1 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] -; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q2, q1, [sp, #16] // 32-byte Folded Spill ; CHECK-GI-NEXT: mov v1.16b, v2.16b -; CHECK-GI-NEXT: str q2, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: mov d8, v1.d[1] -; CHECK-GI-NEXT: fcsel d9, d2, d1, lt -; CHECK-GI-NEXT: fmov x8, d9 -; CHECK-GI-NEXT: fcsel d10, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt ; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] -; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d11, v0.d[1] -; CHECK-GI-NEXT: fcsel d0, d9, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fcsel d1, d10, d11, gt +; CHECK-GI-NEXT: mov x21, #281474976579584 // =0xfffffffe0000 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt +; CHECK-GI-NEXT: movk x21, #16414, lsl #48 ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x20, x21, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, 
[sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q0, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q3, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v3.d[1] -; CHECK-GI-NEXT: fcsel d9, d3, d2, lt -; CHECK-GI-NEXT: fmov x8, d9 -; CHECK-GI-NEXT: fcsel d8, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x20 +; CHECK-GI-NEXT: csel x22, x8, xzr, lt +; CHECK-GI-NEXT: mov v0.d[1], x22 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d8, d11, gt -; CHECK-GI-NEXT: fcsel d0, d9, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x20, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x22, x21, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: mov v0.s[0], w19 -; CHECK-GI-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w0 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: add sp, sp, #96 @@ -723,106 +707,87 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) { ; ; CHECK-GI-LABEL: test_unsigned_v3f128_v3i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sub sp, sp, #128 -; CHECK-GI-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-GI-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 128 +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: stp x30, x23, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 -; CHECK-GI-NEXT: .cfi_offset w30, -32 -; CHECK-GI-NEXT: .cfi_offset b8, -40 -; CHECK-GI-NEXT: .cfi_offset b9, -48 -; CHECK-GI-NEXT: .cfi_offset b10, -56 -; CHECK-GI-NEXT: .cfi_offset b11, -64 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w30, -48 ; CHECK-GI-NEXT: adrp x8, .LCPI16_1 -; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI16_1] -; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: str q2, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q1, q2, [sp, #32] // 32-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q2, q1, [sp, 
#32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: mov d8, v1.d[1] -; CHECK-GI-NEXT: fcsel d10, d2, d1, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt ; CHECK-GI-NEXT: adrp x8, .LCPI16_0 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-GI-NEXT: str q1, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d9, v0.d[1] -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt +; CHECK-GI-NEXT: mov x21, #281474976579584 // =0xfffffffe0000 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt +; CHECK-GI-NEXT: movk x21, #16414, lsl #48 ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x20, x21, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q1, q3, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: fcsel d10, d2, d3, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x20 +; CHECK-GI-NEXT: csel x22, x8, xzr, lt +; CHECK-GI-NEXT: mov v0.d[1], x22 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x20, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x22, x21, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi -; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q4, q1, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v4.d[1] -; CHECK-GI-NEXT: fcsel d10, d4, d2, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d8, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x22, x8, 
xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x22 +; CHECK-GI-NEXT: csel x23, x8, xzr, lt +; CHECK-GI-NEXT: mov v0.d[1], x23 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d8, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x22, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x23, x21, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: mov v0.s[0], w19 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x30, x23, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w20 -; CHECK-GI-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[2], w0 -; CHECK-GI-NEXT: add sp, sp, #128 +; CHECK-GI-NEXT: add sp, sp, #112 ; CHECK-GI-NEXT: ret %x = call <3 x i32> @llvm.fptoui.sat.v3f128.v3i32(<3 x fp128> %f) ret <3 x i32> %x @@ -912,19 +877,18 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) { ; CHECK-GI-LABEL: test_unsigned_v4f128_v4i32: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: sub sp, sp, #144 -; CHECK-GI-NEXT: stp d11, d10, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp d9, d8, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp x30, x21, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill ; CHECK-GI-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: .cfi_def_cfa_offset 144 ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w21, -24 -; CHECK-GI-NEXT: .cfi_offset w30, -32 -; CHECK-GI-NEXT: .cfi_offset b8, -40 -; CHECK-GI-NEXT: .cfi_offset b9, -48 -; CHECK-GI-NEXT: .cfi_offset b10, -56 -; CHECK-GI-NEXT: .cfi_offset b11, -64 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w30, -64 ; CHECK-GI-NEXT: adrp x8, .LCPI17_1 ; CHECK-GI-NEXT: stp q1, q2, [sp] // 32-byte Folded Spill ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI17_1] @@ -932,109 +896,92 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) { ; CHECK-GI-NEXT: str q3, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: str q1, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q2, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: mov d8, v1.d[1] -; CHECK-GI-NEXT: fcsel d10, d2, d1, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt ; CHECK-GI-NEXT: adrp x8, .LCPI17_0 +; CHECK-GI-NEXT: mov v0.d[1], x20 ; CHECK-GI-NEXT: ldr q1, [x8, 
:lo12:.LCPI17_0] ; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: mov d9, v0.d[1] -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt +; CHECK-GI-NEXT: mov x22, #281474976579584 // =0xfffffffe0000 +; CHECK-GI-NEXT: csel x8, x19, xzr, gt +; CHECK-GI-NEXT: movk x22, #16414, lsl #48 ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x20, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q1, q4, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: fcsel d10, d2, d4, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x20 +; CHECK-GI-NEXT: csel x21, x8, xzr, lt +; CHECK-GI-NEXT: mov v0.d[1], x21 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x20, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x21, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q1, q5, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v2.d[1] -; CHECK-GI-NEXT: fcsel d10, d2, d5, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d11, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d11 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x21, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x21 +; CHECK-GI-NEXT: csel x23, x8, xzr, lt +; CHECK-GI-NEXT: mov v0.d[1], x23 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d11, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x21, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x23, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov w21, w0 ; CHECK-GI-NEXT: bl __getf2 -; CHECK-GI-NEXT: ldp q6, q1, [sp, #32] // 
32-byte Folded Reload +; CHECK-GI-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov d0, v6.d[1] -; CHECK-GI-NEXT: fcsel d10, d6, d2, lt -; CHECK-GI-NEXT: fmov x8, d10 -; CHECK-GI-NEXT: fcsel d8, d0, d8, lt -; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d8 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x23, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x23 +; CHECK-GI-NEXT: csel x24, x8, xzr, lt +; CHECK-GI-NEXT: mov v0.d[1], x24 ; CHECK-GI-NEXT: bl __gttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: fcsel d1, d8, d9, gt -; CHECK-GI-NEXT: fcsel d0, d10, d0, gt -; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x8, x23, xzr, gt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: csel x8, x24, x22, gt ; CHECK-GI-NEXT: mov v0.d[1], x8 ; CHECK-GI-NEXT: bl __fixunstfsi ; CHECK-GI-NEXT: mov v0.s[0], w19 -; CHECK-GI-NEXT: ldp d9, d8, [sp, #96] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[1], w20 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[2], w21 -; CHECK-GI-NEXT: ldp x30, x21, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v0.s[3], w0 ; CHECK-GI-NEXT: add sp, sp, #144 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index 167e9d1c1964..70ab10e71687 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -157,10 +157,9 @@ define <2 x i16> @load_v2i16(ptr %ptr){ ; ; CHECK-GI-LABEL: load_v2i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %a = load <2 x i16>, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 5e7f71c18c27..9ca975d9e742 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -167,21 +167,23 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[1] -; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] -; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[2] -; CHECK-GI-NEXT: mov b1, v1.b[3] -; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] -; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov b3, v1.b[3] +; 
CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: mov v1.h[3], w9 ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 @@ -250,14 +252,12 @@ define void @v2i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] ; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll index dbb4270fb800..f6dbf5251fc2 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -1120,10 +1120,9 @@ define <4 x i16> @vselect_constant_cond_zero_v4i16(<4 x i16> %a) { ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v1.b[1], w9 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: mov v1.b[3], w8 -; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: mov v1.h[3], w8 ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #15 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #15 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b @@ -1144,13 +1143,10 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) { ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: mov w9, #0 // =0x0 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v2.h[1], w8 -; CHECK-GI-NEXT: mov v1.h[1], w9 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: mov v1.d[1], v2.d[0] +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: mov v1.s[3], w8 ; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 ; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b @@ -1196,10 +1192,9 @@ define <4 x i16> @vselect_constant_cond_v4i16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: mov w9, #0 // =0x0 ; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: mov v2.b[1], w9 -; CHECK-GI-NEXT: mov v2.b[2], w9 -; CHECK-GI-NEXT: mov v2.b[3], w8 -; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: mov v2.h[3], w8 ; CHECK-GI-NEXT: shl v2.4h, v2.4h, #15 ; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #15 ; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b @@ -1220,13 +1215,10 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: mov w9, #0 // =0x0 -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: mov 
v3.h[1], w8 -; CHECK-GI-NEXT: mov v2.h[1], w9 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: mov v2.d[1], v3.d[0] +; CHECK-GI-NEXT: mov v2.s[0], w8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mov v2.s[2], w9 +; CHECK-GI-NEXT: mov v2.s[3], w8 ; CHECK-GI-NEXT: shl v2.4s, v2.4s, #31 ; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #31 ; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll index adc89f7a0d99..8f7d5dd5588b 100644 --- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -2672,14 +2672,9 @@ define <4 x i32> @fcmal4xfloat(<4 x float> %A, <4 x float> %B) { ; CHECK-GI-LABEL: fcmal4xfloat: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #1 // =0x1 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: mov v1.16b, v0.16b -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 +; CHECK-GI-NEXT: dup v0.2s, w8 +; CHECK-GI-NEXT: mov v0.d[1], v0.d[0] +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31 ; CHECK-GI-NEXT: ret %tmp3 = fcmp true <4 x float> %A, %B @@ -2723,14 +2718,10 @@ define <4 x i32> @fcmnv4xfloat(<4 x float> %A, <4 x float> %B) { ; CHECK-GI-LABEL: fcmnv4xfloat: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #0 // =0x0 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: mov v1.16b, v0.16b -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] -; CHECK-GI-NEXT: shl v0.4s, v1.4s, #31 +; CHECK-GI-NEXT: mov v0.s[0], w8 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v0.d[1], v0.d[0] +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31 ; CHECK-GI-NEXT: ret %tmp3 = fcmp false <4 x float> %A, %B diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 0f256c1f18f5..853ed92c91fb 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -1198,58 +1198,50 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v16i10_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: ldr w10, [sp, #32] -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: ldr w9, [sp, #8] -; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: fmov s3, w10 -; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], w1 -; CHECK-GI-NEXT: mov v1.h[1], w5 -; CHECK-GI-NEXT: mov v2.h[1], w9 -; CHECK-GI-NEXT: mov v3.h[1], w11 -; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v0.h[2], w2 -; CHECK-GI-NEXT: mov v1.h[2], w6 -; CHECK-GI-NEXT: mov v2.h[2], w8 -; CHECK-GI-NEXT: mov v3.h[2], w9 -; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v0.h[3], w3 -; CHECK-GI-NEXT: mov v1.h[3], w7 -; CHECK-GI-NEXT: mov v2.h[3], w8 -; CHECK-GI-NEXT: mov v3.h[3], w9 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-GI-NEXT: 
ushll v5.2d, v1.2s, #0 -; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-GI-NEXT: ushll v6.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll2 v2.2d, v2.4s, #0 -; CHECK-GI-NEXT: ushll v7.2d, v3.2s, #0 -; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0 -; CHECK-GI-NEXT: shl v4.2d, v4.2d, #54 -; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54 +; CHECK-GI-NEXT: mov v1.s[0], w0 +; CHECK-GI-NEXT: mov v2.s[0], w2 +; CHECK-GI-NEXT: ldr s0, [sp] +; CHECK-GI-NEXT: mov v3.s[0], w4 +; CHECK-GI-NEXT: mov v4.s[0], w6 +; CHECK-GI-NEXT: ldr s5, [sp, #8] +; CHECK-GI-NEXT: ldr s6, [sp, #16] +; CHECK-GI-NEXT: ldr s7, [sp, #24] +; CHECK-GI-NEXT: ldr s16, [sp, #32] +; CHECK-GI-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NEXT: ldr s18, [sp, #48] +; CHECK-GI-NEXT: ldr s19, [sp, #56] +; CHECK-GI-NEXT: mov v1.s[1], w1 +; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] +; CHECK-GI-NEXT: mov v2.s[1], w3 +; CHECK-GI-NEXT: mov v3.s[1], w5 +; CHECK-GI-NEXT: mov v4.s[1], w7 +; CHECK-GI-NEXT: mov v6.s[1], v7.s[0] +; CHECK-GI-NEXT: mov v16.s[1], v17.s[0] +; CHECK-GI-NEXT: mov v18.s[1], v19.s[0] +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll v3.2d, v3.2s, #0 +; CHECK-GI-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-GI-NEXT: ushll v5.2d, v6.2s, #0 +; CHECK-GI-NEXT: ushll v6.2d, v16.2s, #0 +; CHECK-GI-NEXT: ushll v7.2d, v18.2s, #0 +; CHECK-GI-NEXT: shl v0.2d, v0.2d, #54 +; CHECK-GI-NEXT: shl v1.2d, v1.2d, #54 +; CHECK-GI-NEXT: shl v2.2d, v2.2d, #54 +; CHECK-GI-NEXT: shl v3.2d, v3.2d, #54 +; CHECK-GI-NEXT: shl v16.2d, v4.2d, #54 ; CHECK-GI-NEXT: shl v5.2d, v5.2d, #54 -; CHECK-GI-NEXT: shl v17.2d, v1.2d, #54 ; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54 -; CHECK-GI-NEXT: shl v18.2d, v2.2d, #54 ; CHECK-GI-NEXT: shl v7.2d, v7.2d, #54 -; CHECK-GI-NEXT: shl v19.2d, v3.2d, #54 -; CHECK-GI-NEXT: sshr v0.2d, v4.2d, #54 -; CHECK-GI-NEXT: sshr v1.2d, v16.2d, #54 -; CHECK-GI-NEXT: sshr v2.2d, v5.2d, #54 -; CHECK-GI-NEXT: sshr v3.2d, v17.2d, #54 -; CHECK-GI-NEXT: sshr v4.2d, v6.2d, #54 -; CHECK-GI-NEXT: sshr v5.2d, v18.2d, #54 -; CHECK-GI-NEXT: sshr v6.2d, v7.2d, #54 -; CHECK-GI-NEXT: sshr v7.2d, v19.2d, #54 +; CHECK-GI-NEXT: sshr v4.2d, v0.2d, #54 +; CHECK-GI-NEXT: sshr v0.2d, v1.2d, #54 +; CHECK-GI-NEXT: sshr v1.2d, v2.2d, #54 +; CHECK-GI-NEXT: sshr v2.2d, v3.2d, #54 +; CHECK-GI-NEXT: sshr v3.2d, v16.2d, #54 +; CHECK-GI-NEXT: sshr v5.2d, v5.2d, #54 +; CHECK-GI-NEXT: sshr v6.2d, v6.2d, #54 +; CHECK-GI-NEXT: sshr v7.2d, v7.2d, #54 ; CHECK-GI-NEXT: ret entry: %c = sext <16 x i10> %a to <16 x i64> diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index c298e6d8a1ff..8f35a69f52b8 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -155,21 +155,23 @@ define void @v4i8(ptr %p1, ptr %p2) { ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 ; CHECK-GI-NEXT: mov b2, v0.b[1] -; CHECK-GI-NEXT: mov v3.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[1] -; CHECK-GI-NEXT: mov v5.b[0], v1.b[0] -; CHECK-GI-NEXT: mov v3.b[1], v2.b[0] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v5.b[1], v4.b[0] -; CHECK-GI-NEXT: mov b4, v1.b[2] -; CHECK-GI-NEXT: mov b1, v1.b[3] -; CHECK-GI-NEXT: mov v3.b[2], v2.b[0] -; CHECK-GI-NEXT: mov v5.b[2], v4.b[0] -; CHECK-GI-NEXT: mov v3.b[3], v0.b[0] -; CHECK-GI-NEXT: mov v5.b[3], v1.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v3.8b, #0 -; CHECK-GI-NEXT: ushll v1.8h, v5.8b, #0 +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; 
CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov b3, v1.b[3] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: fmov w8, s5 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: mov v1.h[3], w9 ; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: fmov w8, s0 @@ -238,14 +240,12 @@ define void @v2i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-GI-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] ; CHECK-GI-NEXT: sub v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: mov s1, v0.s[1] ; CHECK-GI-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll index fb3f8ebd7d14..8a4d6b8c7b78 100644 --- a/llvm/test/CodeGen/AArch64/xtn.ll +++ b/llvm/test/CodeGen/AArch64/xtn.ll @@ -174,9 +174,8 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) { ; ; CHECK-GI-LABEL: xtn_v2i128_v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: mov v0.h[1], w2 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v0.s[1], w2 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 7e95b6684e82..0d5010113ce0 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -1169,52 +1169,44 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v16i10_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr w8, [sp] -; CHECK-GI-NEXT: ldr w10, [sp, #32] -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w4 -; CHECK-GI-NEXT: ldr w9, [sp, #8] -; CHECK-GI-NEXT: ldr w11, [sp, #40] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: fmov s3, w10 -; CHECK-GI-NEXT: ldr w8, [sp, #16] -; CHECK-GI-NEXT: mov v0.h[1], w1 -; CHECK-GI-NEXT: mov v1.h[1], w5 -; CHECK-GI-NEXT: mov v2.h[1], w9 -; CHECK-GI-NEXT: mov v3.h[1], w11 -; CHECK-GI-NEXT: ldr w9, [sp, #48] -; CHECK-GI-NEXT: mov v0.h[2], w2 -; CHECK-GI-NEXT: mov v1.h[2], w6 -; CHECK-GI-NEXT: mov v2.h[2], w8 -; CHECK-GI-NEXT: mov v3.h[2], w9 -; CHECK-GI-NEXT: ldr w8, [sp, #24] -; CHECK-GI-NEXT: ldr w9, [sp, #56] -; CHECK-GI-NEXT: mov v0.h[3], w3 -; CHECK-GI-NEXT: mov v1.h[3], w7 -; CHECK-GI-NEXT: mov v2.h[3], w8 -; CHECK-GI-NEXT: mov v3.h[3], w9 +; CHECK-GI-NEXT: mov v0.s[0], w0 +; CHECK-GI-NEXT: mov v1.s[0], w2 +; CHECK-GI-NEXT: ldr s3, [sp] +; CHECK-GI-NEXT: mov v2.s[0], w4 +; CHECK-GI-NEXT: mov v5.s[0], w6 +; CHECK-GI-NEXT: ldr s4, [sp, #8] +; CHECK-GI-NEXT: ldr s6, [sp, #16] +; CHECK-GI-NEXT: ldr s7, [sp, #24] +; CHECK-GI-NEXT: ldr s16, [sp, #32] +; CHECK-GI-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NEXT: ldr s18, [sp, #48] +; CHECK-GI-NEXT: ldr s19, [sp, #56] +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov 
v1.s[1], w3 +; CHECK-GI-NEXT: mov v3.s[1], v4.s[0] +; CHECK-GI-NEXT: mov v2.s[1], w5 +; CHECK-GI-NEXT: mov v5.s[1], w7 +; CHECK-GI-NEXT: mov v6.s[1], v7.s[0] +; CHECK-GI-NEXT: mov v16.s[1], v17.s[0] +; CHECK-GI-NEXT: mov v18.s[1], v19.s[0] ; CHECK-GI-NEXT: adrp x8, .LCPI54_0 ; CHECK-GI-NEXT: ldr q7, [x8, :lo12:.LCPI54_0] -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll2 v5.2d, v0.4s, #0 -; CHECK-GI-NEXT: ushll v6.2d, v1.2s, #0 -; CHECK-GI-NEXT: ushll2 v16.2d, v1.4s, #0 -; CHECK-GI-NEXT: ushll v17.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll2 v18.2d, v2.4s, #0 -; CHECK-GI-NEXT: ushll v19.2d, v3.2s, #0 -; CHECK-GI-NEXT: ushll2 v20.2d, v3.4s, #0 -; CHECK-GI-NEXT: and v0.16b, v4.16b, v7.16b -; CHECK-GI-NEXT: and v1.16b, v5.16b, v7.16b -; CHECK-GI-NEXT: and v2.16b, v6.16b, v7.16b -; CHECK-GI-NEXT: and v3.16b, v16.16b, v7.16b -; CHECK-GI-NEXT: and v4.16b, v17.16b, v7.16b -; CHECK-GI-NEXT: and v5.16b, v18.16b, v7.16b -; CHECK-GI-NEXT: and v6.16b, v19.16b, v7.16b -; CHECK-GI-NEXT: and v7.16b, v20.16b, v7.16b +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll v4.2d, v5.2s, #0 +; CHECK-GI-NEXT: ushll v5.2d, v3.2s, #0 +; CHECK-GI-NEXT: ushll v6.2d, v6.2s, #0 +; CHECK-GI-NEXT: ushll v16.2d, v16.2s, #0 +; CHECK-GI-NEXT: ushll v17.2d, v18.2s, #0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v7.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v7.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v7.16b +; CHECK-GI-NEXT: and v3.16b, v4.16b, v7.16b +; CHECK-GI-NEXT: and v4.16b, v5.16b, v7.16b +; CHECK-GI-NEXT: and v5.16b, v6.16b, v7.16b +; CHECK-GI-NEXT: and v6.16b, v16.16b, v7.16b +; CHECK-GI-NEXT: and v7.16b, v17.16b, v7.16b ; CHECK-GI-NEXT: ret entry: %c = zext <16 x i10> %a to <16 x i64> -- GitLab From 5e81437f2ba03ee0ab93b26a9654da9b95dab3b0 Mon Sep 17 00:00:00 2001 From: BrnBlrg Date: Sat, 19 Oct 2024 02:06:12 -0700 Subject: [PATCH 149/511] [analyzer][doc] Fix typo in "translation unit" in analyzer doc CommandLineUsage.rst (#112966) --- clang/docs/analyzer/user-docs/CommandLineUsage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/analyzer/user-docs/CommandLineUsage.rst b/clang/docs/analyzer/user-docs/CommandLineUsage.rst index d7f8253469df..59f8187f374a 100644 --- a/clang/docs/analyzer/user-docs/CommandLineUsage.rst +++ b/clang/docs/analyzer/user-docs/CommandLineUsage.rst @@ -2,7 +2,7 @@ Command Line Usage: scan-build and CodeChecker ============================================== This document provides guidelines for running the static analyzer from the command line on whole projects. -CodeChecker and scan-build are two CLI tools for using CSA on multiple files (tranlation units). +CodeChecker and scan-build are two CLI tools for using CSA on multiple files (translation units). Both provide a way of driving the analyzer, detecting compilation flags, and generating reports. CodeChecker is more actively maintained, provides heuristics for working with multiple versions of popular compilers and it also comes with a web-based GUI for viewing, filtering, categorizing and suppressing the results. Therefore CodeChecker is recommended in case you need any of the above features or just more customizability in general. 
-- GitLab From faed85b8e4961e853bfb10cd8ed1544e179ade0a Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Sat, 19 Oct 2024 10:07:57 +0100 Subject: [PATCH 150/511] [lldb][test][NFC] Document DYLIB_NAME Makefile variable (#112735) Got caught out by this because simply specifying `DYLIB_CXX_SOURCES` (without specifying `DYLIB_NAME`) resulted in linker errors because the dylib was never built (and linked). We should probably make that a Makefile error (though I haven't audited when exactly not specifying `DYLIB_NAME` is valid; looked like that can happen when we specify `FRAMEWORK`). --- lldb/packages/Python/lldbsuite/test/make/Makefile.rules | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index f81db9bc06d8..d0045ac9f91a 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -13,6 +13,13 @@ # the building of the a.out executable program. For example, # DYLIB_ONLY := YES # +# When specifying one of the DYLIB_*_SOURCES variables, DYLIB_NAME +# controls the (platform-dependent) name of the produced dylib. E.g., +# on Darwin, if "DYLIB_NAME := foo", the generated dylib will be called +# "libfoo.dylib". +# +# DYLIB_NAME := foo +# # Specifying FRAMEWORK and its variants has the effect of building a NeXT-style # framework. # FRAMEWORK := "Foo" -- GitLab From aa320600e2b7136f5156dd0c31f98ec0f8d5bce1 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Sat, 19 Oct 2024 10:08:29 +0100 Subject: [PATCH 151/511] [lldb][ClangASTImporter][NFC] Emit a log message when we break MapImported invariant (#112748) This patch emits a warning into the expression log when we call `MapImported` on a decl which has already been imported, but with a new `to` destination decl. In asserts builds this would lead to triggering this [ASTImporter::MapImported assertion](https://github.com/llvm/llvm-project/blob/6d7712a70c163d2ae9e1dc928db31fcb45d9e404/clang/lib/AST/ASTImporter.cpp#L10493-L10494). In no-asserts builds we will likely crash, in potentially non-obvious ways. The hope is that the log message will help in diagnosing this type of issue in the field. The underlying issue is discussed in more detail in: https://github.com/llvm/llvm-project/pull/112566. In a non-asserts build, the last few expression log entries would look as follows: ``` CompleteTagDecl on (ASTContext*)scratch ASTContext Completing (TagDecl*)0x00000001132d31d0 named Foo CTD Before: CXXRecordDecl 0x1132d31d0 <> struct Foo [ClangASTImporter] WARNING: overwriting an already imported decl '0x000000014378fd80' ('Foo') from '0x0000000143790c00' with 0x00000001132d31d0. Likely due to a name conflict when importing 'Foo'. 
[ClangASTImporter] Imported (FieldDecl*)0x0000000143790220, named service (from (Decl*)0x0000000143791270), metadata 271 [ClangASTImporter] Decl has no origin information in (ASTContext*)0x00000001132c8c00 FindExternalLexicalDecls on (ASTContext*)0x0000000143c1f600 'scratch ASTContext' in 'Foo' (CXXRecordDecl*)0x000000014378FD80 FELD Original decl (ASTContext*)0x00000001132c8c00 (Decl*)0x0000000143790c00: CXXRecordDecl 0x143790c00 <> struct Foo definition |-DefinitionData pass_in_registers aggregate standard_layout trivially_copyable pod trivial literal | |-DefaultConstructor exists trivial needs_implicit | |-CopyConstructor simple trivial has_const_param needs_implicit implicit_has_const_param | |-MoveConstructor exists simple trivial needs_implicit | |-CopyAssignment simple trivial has_const_param needs_implicit implicit_has_const_param | |-MoveAssignment exists simple trivial needs_implicit | `-Destructor simple irrelevant trivial needs_implicit |-FieldDecl 0x143791270 <> service 'Service *' `-FieldDecl 0x1437912c8 <> mach_endpoint 'int' FELD Adding [to CXXRecordDecl Foo] lexical FieldDecl FieldDecl 0x143791270 <> service 'Service *' FELD Adding [to CXXRecordDecl Foo] lexical FieldDecl FieldDecl 0x1437912c8 <> mach_endpoint 'int' [ClangASTImporter] Imported (FieldDecl*)0x0000000143790278, named mach_endpoint (from (Decl*)0x00000001437912c8), metadata 280 [ClangASTImporter] Decl has no origin information in (ASTContext*)0x00000001132c8c00 ``` Note how we start "completing" `Foo`. Then emit our new `WARNING`. Shortly after, we crash, and the log abruptly ends. rdar://135551810 --- .../Clang/ClangASTImporter.cpp | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp index 630ad7e20ab7..db9a6dd197b3 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp @@ -1136,6 +1136,29 @@ ClangASTImporter::ASTImporterDelegate::ImportImpl(Decl *From) { void ClangASTImporter::ASTImporterDelegate::ImportDefinitionTo( clang::Decl *to, clang::Decl *from) { + Log *log = GetLog(LLDBLog::Expressions); + + auto getDeclName = [](Decl const *decl) { + std::string name_string; + if (auto const *from_named_decl = dyn_cast(decl)) { + llvm::raw_string_ostream name_stream(name_string); + from_named_decl->printName(name_stream); + } + + return name_string; + }; + + if (log) { + if (auto *D = GetAlreadyImportedOrNull(from); D && D != to) { + LLDB_LOG( + log, + "[ClangASTImporter] ERROR: overwriting an already imported decl " + "'{0:x}' ('{1}') from '{2:x}' with '{3:x}'. Likely due to a name " + "conflict when importing '{1}'.", + D, getDeclName(from), from, to); + } + } + // We might have a forward declaration from a shared library that we // gave external lexical storage so that Clang asks us about the full // definition when it needs it. In this case the ASTImporter isn't aware @@ -1145,8 +1168,6 @@ void ClangASTImporter::ASTImporterDelegate::ImportDefinitionTo( // tell the ASTImporter that 'to' was imported from 'from'. 
MapImported(from, to);
 
-  Log *log = GetLog(LLDBLog::Expressions);
-
   if (llvm::Error err = ImportDefinition(from)) {
     LLDB_LOG_ERROR(log, std::move(err),
                    "[ClangASTImporter] Error during importing definition: {0}");
@@ -1158,18 +1179,13 @@ void ClangASTImporter::ASTImporterDelegate::ImportDefinitionTo(
       to_tag->setCompleteDefinition(from_tag->isCompleteDefinition());
 
   if (Log *log_ast = GetLog(LLDBLog::AST)) {
-    std::string name_string;
-    if (NamedDecl *from_named_decl = dyn_cast<NamedDecl>(from)) {
-      llvm::raw_string_ostream name_stream(name_string);
-      from_named_decl->printName(name_stream);
-    }
     LLDB_LOG(log_ast,
              "==== [ClangASTImporter][TUDecl: {0:x}] Imported "
              "({1}Decl*){2:x}, named {3} (from "
              "(Decl*){4:x})",
              static_cast<void *>(to->getTranslationUnitDecl()),
-             from->getDeclKindName(), static_cast<void *>(to), name_string,
-             static_cast<void *>(from));
+             from->getDeclKindName(), static_cast<void *>(to),
+             getDeclName(from), static_cast<void *>(from));
 
     // Log the AST of the TU.
     std::string ast_string;
-- 
GitLab


From 1bbf3a37056761ec407031431e28f856428566f0 Mon Sep 17 00:00:00 2001
From: Hui
Date: Sat, 19 Oct 2024 11:09:25 +0100
Subject: [PATCH 152/511] [libc++] Fix `reverse_iterator` when underlying is
 c++20 `bidirectional_iterator` but not `Cpp17BidirectionalIterator`
 (#112100)

`reverse_iterator` supports either a C++20 `bidirectional_iterator` or a
`Cpp17BidirectionalIterator`:
http://eel.is/c++draft/reverse.iter.requirements

The current `reverse_iterator` uses `std::prev` in its `operator->`,
which only supports the `Cpp17BidirectionalIterator` case properly.

If the underlying iterator is a C++20 `bidirectional_iterator` but does
not satisfy the named requirement `Cpp17BidirectionalIterator` (examples
are `zip_view::iterator` and `flat_map::iterator`), the current
`std::prev` silently compiles but does a no-op and returns the same
iterator back. So `reverse_iterator::operator->` will silently give a
wrong answer.

Even if we fixed the behaviour of `std::prev`, the best we could do
would be to fail to compile the code. But that is still not OK, because
we need to support this kind of iterator in `reverse_iterator`.

The solution is simply to not use `std::prev`.

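Concretely, the change below replaces the `std::prev` call with a plain
copy-and-decrement, which only requires C++20 `bidirectional_iterator`
semantics. A sketch of the resulting `operator->` (same names as in the
diff below; `current` is the exposition-only member holding the
underlying iterator):

```cpp
_LIBCPP_HIDE_FROM_ABI constexpr pointer operator->() const
  requires is_pointer_v<_Iter> || requires(const _Iter __i) { __i.operator->(); }
{
  _Iter __tmp = current; // copy the underlying iterator...
  --__tmp;               // ...and step it back by hand: plain pre-decrement,
                         // valid for any C++20 bidirectional_iterator
  if constexpr (is_pointer_v<_Iter>) {
    return __tmp;
  } else {
    return __tmp.operator->();
  }
}
```
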
--------- Co-authored-by: Louis Dionne --- libcxx/include/__iterator/reverse_iterator.h | 6 +- .../reverse.iter.cmp/equal.pass.cpp | 4 + .../reverse.iter.cmp/greater-equal.pass.cpp | 5 ++ .../reverse.iter.cmp/greater.pass.cpp | 5 ++ .../reverse.iter.cmp/less-equal.pass.cpp | 5 ++ .../reverse.iter.cmp/less.pass.cpp | 5 ++ .../reverse.iter.cmp/not-equal.pass.cpp | 4 + .../reverse.iter.cons/assign.pass.cpp | 3 + .../reverse.iter.cons/ctor.default.pass.cpp | 3 + .../reverse.iter.cons/ctor.iter.pass.cpp | 3 + .../ctor.reverse_iterator.pass.cpp | 3 + .../reverse.iter.conv/base.pass.cpp | 30 +++++--- .../reverse.iter.elem/arrow.pass.cpp | 59 +++++++++++++++ .../reverse.iter.elem/bracket.pass.cpp | 4 + .../reverse.iter.elem/dereference.pass.cpp | 5 ++ .../decrement-assign.pass.cpp | 3 + .../increment-assign.pass.cpp | 5 +- .../reverse.iter.nav/minus.pass.cpp | 5 +- .../reverse.iter.nav/plus.pass.cpp | 5 +- .../reverse.iter.nav/postdecrement.pass.cpp | 3 + .../reverse.iter.nav/postincrement.pass.cpp | 3 + .../reverse.iter.nav/predecrement.pass.cpp | 3 + .../reverse.iter.nav/preincrement.pass.cpp | 3 + .../make_reverse_iterator.pass.cpp | 34 ++++++--- .../reverse.iter.nonmember/minus.pass.cpp | 74 ++++++++++++------- .../reverse.iter.nonmember/plus.pass.cpp | 3 + 26 files changed, 229 insertions(+), 56 deletions(-) diff --git a/libcxx/include/__iterator/reverse_iterator.h b/libcxx/include/__iterator/reverse_iterator.h index 50c0f21eaa28..5e88d86ad5e9 100644 --- a/libcxx/include/__iterator/reverse_iterator.h +++ b/libcxx/include/__iterator/reverse_iterator.h @@ -136,10 +136,12 @@ public: _LIBCPP_HIDE_FROM_ABI constexpr pointer operator->() const requires is_pointer_v<_Iter> || requires(const _Iter __i) { __i.operator->(); } { + _Iter __tmp = current; + --__tmp; if constexpr (is_pointer_v<_Iter>) { - return std::prev(current); + return __tmp; } else { - return std::prev(current).operator->(); + return __tmp.operator->(); } } #else diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/equal.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/equal.pass.cpp index fcf8d88fcf62..6fe575ebdd9a 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/equal.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/equal.pass.cpp @@ -33,6 +33,10 @@ TEST_CONSTEXPR_CXX17 bool tests() { test(bidirectional_iterator(s), bidirectional_iterator(s+1), false); test(random_access_iterator(s), random_access_iterator(s), true); test(random_access_iterator(s), random_access_iterator(s+1), false); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s), true); + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s + 1), false); +#endif test(s, s, true); test(s, s+1, false); return true; diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/greater-equal.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/greater-equal.pass.cpp index fdcd02abb0d8..b2bfdb56d646 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/greater-equal.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/greater-equal.pass.cpp @@ -32,6 +32,11 @@ TEST_CONSTEXPR_CXX17 bool tests() { test(random_access_iterator(s), random_access_iterator(s), true); test(random_access_iterator(s), random_access_iterator(s+1), 
true); test(random_access_iterator(s+1), random_access_iterator(s), false); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s), true); + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s + 1), true); + test(cpp20_random_access_iterator(s + 1), cpp20_random_access_iterator(s), false); +#endif test(s, s, true); test(s, s+1, true); test(s+1, s, false); diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/greater.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/greater.pass.cpp index dce331e51964..38f9258de31f 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/greater.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/greater.pass.cpp @@ -32,6 +32,11 @@ TEST_CONSTEXPR_CXX17 bool tests() { test(random_access_iterator(s), random_access_iterator(s), false); test(random_access_iterator(s), random_access_iterator(s+1), true); test(random_access_iterator(s+1), random_access_iterator(s), false); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s), false); + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s + 1), true); + test(cpp20_random_access_iterator(s + 1), cpp20_random_access_iterator(s), false); +#endif test(s, s, false); test(s, s+1, true); test(s+1, s, false); diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/less-equal.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/less-equal.pass.cpp index e9cea6250a76..a57930b11131 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/less-equal.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/less-equal.pass.cpp @@ -32,6 +32,11 @@ TEST_CONSTEXPR_CXX17 bool tests() { test(random_access_iterator(s), random_access_iterator(s), true); test(random_access_iterator(s), random_access_iterator(s+1), false); test(random_access_iterator(s+1), random_access_iterator(s), true); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s), true); + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s + 1), false); + test(cpp20_random_access_iterator(s + 1), cpp20_random_access_iterator(s), true); +#endif test(s, s, true); test(s, s+1, false); test(s+1, s, true); diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/less.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/less.pass.cpp index b66147cf3a03..4cd3f249d033 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/less.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/less.pass.cpp @@ -32,6 +32,11 @@ TEST_CONSTEXPR_CXX17 bool tests() { test(random_access_iterator(s), random_access_iterator(s), false); test(random_access_iterator(s), random_access_iterator(s+1), false); test(random_access_iterator(s+1), random_access_iterator(s), true); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s), false); + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s + 1), false); + test(cpp20_random_access_iterator(s + 1), cpp20_random_access_iterator(s), true); +#endif test(s, s, false); test(s, s+1, false); test(s+1, s, 
true); diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/not-equal.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/not-equal.pass.cpp index 37a6ff1302ce..509ac297c3cb 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/not-equal.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cmp/not-equal.pass.cpp @@ -33,6 +33,10 @@ TEST_CONSTEXPR_CXX17 bool tests() { test(bidirectional_iterator(s), bidirectional_iterator(s+1), true); test(random_access_iterator(s), random_access_iterator(s), false); test(random_access_iterator(s), random_access_iterator(s+1), true); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s), false); + test(cpp20_random_access_iterator(s), cpp20_random_access_iterator(s + 1), true); +#endif test(s, s, false); test(s, s+1, true); return true; diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/assign.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/assign.pass.cpp index 0e5123a49e2b..f9d2efa7c2a8 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/assign.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/assign.pass.cpp @@ -59,6 +59,9 @@ TEST_CONSTEXPR_CXX17 bool tests() { Derived d; test >(bidirectional_iterator(&d)); test >(random_access_iterator(&d)); +#if TEST_STD_VER >= 20 + test >(cpp20_random_access_iterator(&d)); +#endif test(&d); char c = '\0'; diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.default.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.default.pass.cpp index fcb96de91d1a..90047b19f5a6 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.default.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.default.pass.cpp @@ -26,6 +26,9 @@ TEST_CONSTEXPR_CXX17 void test() { TEST_CONSTEXPR_CXX17 bool tests() { test >(); test >(); +#if TEST_STD_VER >= 20 + test >(); +#endif test(); test(); return true; diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.iter.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.iter.pass.cpp index 801b2cf879ce..72e77b085642 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.iter.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.iter.pass.cpp @@ -28,6 +28,9 @@ TEST_CONSTEXPR_CXX17 bool tests() { const char s[] = "123"; test(bidirectional_iterator(s)); test(random_access_iterator(s)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s)); +#endif test(s); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.reverse_iterator.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.reverse_iterator.pass.cpp index 8f315e83f6d7..fa967b45b1d9 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.reverse_iterator.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.cons/ctor.reverse_iterator.pass.cpp @@ -33,6 +33,9 @@ TEST_CONSTEXPR_CXX17 
bool tests() { Derived d; test >(bidirectional_iterator(&d)); test >(random_access_iterator(&d)); +#if TEST_STD_VER >= 20 + test >(cpp20_random_access_iterator(&d)); +#endif test(&d); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.conv/base.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.conv/base.pass.cpp index 4fb33f542604..35ed17583c85 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.conv/base.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.conv/base.pass.cpp @@ -18,20 +18,28 @@ #include "test_macros.h" #include "test_iterators.h" -TEST_CONSTEXPR_CXX17 bool test() { - typedef bidirectional_iterator Iter; - int i = 0; - Iter iter(&i); - std::reverse_iterator const reverse(iter); - std::reverse_iterator::iterator_type base = reverse.base(); - assert(base == Iter(&i)); - return true; +template +TEST_CONSTEXPR_CXX17 void test() { + int i = 0; + Iter iter(&i); + std::reverse_iterator const reverse(iter); + typename std::reverse_iterator::iterator_type base = reverse.base(); + assert(base == Iter(&i)); +} + +TEST_CONSTEXPR_CXX17 bool tests() { + test >(); + test >(); +#if TEST_STD_VER >= 20 + test>(); +#endif + return true; } int main(int, char**) { - test(); + tests(); #if TEST_STD_VER > 14 - static_assert(test(), ""); + static_assert(tests(), ""); #endif - return 0; + return 0; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp index 15d18d9145ef..665a1a89223b 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp @@ -24,6 +24,55 @@ #include "test_macros.h" +#if TEST_STD_VER >= 20 +// C++20 bidirectional_iterator that does not satisfy the Cpp17BidirectionalIterator named requirement. 
+template <class It> +class cpp20_bidirectional_iterator_with_arrow { + It it_; + +public: + using iterator_category = std::input_iterator_tag; + using iterator_concept = std::bidirectional_iterator_tag; + using value_type = std::iterator_traits<It>::value_type; + using difference_type = std::iterator_traits<It>::difference_type; + + cpp20_bidirectional_iterator_with_arrow() : it_() {} + explicit cpp20_bidirectional_iterator_with_arrow(It it) : it_(it) {} + + decltype(auto) operator*() const { return *it_; } + + auto operator->() const { + if constexpr (std::is_pointer_v<It>) { + return it_; + } else { + return it_.operator->(); + } + } + + cpp20_bidirectional_iterator_with_arrow& operator++() { + ++it_; + return *this; + } + cpp20_bidirectional_iterator_with_arrow& operator--() { + --it_; + return *this; + } + cpp20_bidirectional_iterator_with_arrow operator++(int) { return cpp20_bidirectional_iterator_with_arrow(it_++); } + cpp20_bidirectional_iterator_with_arrow operator--(int) { return cpp20_bidirectional_iterator_with_arrow(it_--); } + + friend bool + operator==(const cpp20_bidirectional_iterator_with_arrow& x, const cpp20_bidirectional_iterator_with_arrow& y) { + return x.it_ == y.it_; + } + friend bool + operator!=(const cpp20_bidirectional_iterator_with_arrow& x, const cpp20_bidirectional_iterator_with_arrow& y) { + return x.it_ != y.it_; + } + + friend It base(const cpp20_bidirectional_iterator_with_arrow& i) { return i.it_; } +}; +#endif + class A { int data_; @@ -113,6 +162,16 @@ int main(int, char**) static_assert(it1->get() == gC.get(), ""); } +#endif +#if TEST_STD_VER >= 20 + { + // The underlying iterator models c++20 bidirectional_iterator, + // but does not satisfy c++17 BidirectionalIterator named requirement + B data[] = {1, 2, 3}; + cpp20_bidirectional_iterator_with_arrow<B*> iter(data + 3); + auto ri = std::make_reverse_iterator(iter); + assert(ri->get() == 3); + } #endif { ((void)gC); diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/bracket.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/bracket.pass.cpp index 37a857ceefa8..8b45bfa09b4f 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/bracket.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/bracket.pass.cpp @@ -33,6 +33,10 @@ TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "1234567890"; test(random_access_iterator<const char*>(s+5), 4, '1'); test(random_access_iterator<const char*>(s+5), 0, '5'); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator<const char*>(s + 5), 4, '1'); + test(cpp20_random_access_iterator<const char*>(s + 5), 0, '5'); +#endif test(s+5, 4, '1'); test(s+5, 0, '5'); return true; diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/dereference.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/dereference.pass.cpp index 292c6da9a773..c3a489085c68 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/dereference.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/dereference.pass.cpp @@ -21,6 +21,7 @@ #include #include "test_macros.h" +#include "test_iterators.h" class A { @@ -47,6 +48,10 @@ int main(int, char**) { A a; test(&a+1, A()); + test(random_access_iterator<A*>(&a + 1), A()); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator<A*>(&a + 1), A()); +#endif #if TEST_STD_VER > 14 { diff --git
a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/decrement-assign.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/decrement-assign.pass.cpp index 8c83ec1e9389..91c2d9363619 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/decrement-assign.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/decrement-assign.pass.cpp @@ -30,6 +30,9 @@ TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits::differen TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "1234567890"; test(random_access_iterator(s+5), 5, random_access_iterator(s+10)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s + 5), 5, cpp20_random_access_iterator(s + 10)); +#endif test(s+5, 5, s+10); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/increment-assign.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/increment-assign.pass.cpp index e32fac9fc24f..2a2746f2cc52 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/increment-assign.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/increment-assign.pass.cpp @@ -29,7 +29,10 @@ TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits::differen TEST_CONSTEXPR_CXX17 bool tests() { char const* s = "1234567890"; - test(random_access_iterator(s+5), 5, random_access_iterator(s)); + test(random_access_iterator(s + 5), 5, random_access_iterator(s)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s + 5), 5, cpp20_random_access_iterator(s)); +#endif test(s+5, 5, s); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/minus.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/minus.pass.cpp index f2474dd7669f..759cacad94e2 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/minus.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/minus.pass.cpp @@ -28,7 +28,10 @@ TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits::differen TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "1234567890"; - test(random_access_iterator(s+5), 5, random_access_iterator(s+10)); + test(random_access_iterator(s + 5), 5, random_access_iterator(s + 10)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s + 5), 5, cpp20_random_access_iterator(s + 10)); +#endif test(s+5, 5, s+10); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/plus.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/plus.pass.cpp index 5673425e7967..24fa84e4f37c 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/plus.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/plus.pass.cpp @@ -28,7 +28,10 @@ TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits::differen TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "1234567890"; - test(random_access_iterator(s+5), 5, random_access_iterator(s)); + test(random_access_iterator(s + 5), 5, random_access_iterator(s)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s + 5), 5, cpp20_random_access_iterator(s)); +#endif test(s+5, 5, s); return true; 
} diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/postdecrement.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/postdecrement.pass.cpp index 24bedad314b7..f0551b5efece 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/postdecrement.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/postdecrement.pass.cpp @@ -30,6 +30,9 @@ TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "123"; test(bidirectional_iterator(s+1), bidirectional_iterator(s+2)); test(random_access_iterator(s+1), random_access_iterator(s+2)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s + 1), cpp20_random_access_iterator(s + 2)); +#endif test(s+1, s+2); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/postincrement.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/postincrement.pass.cpp index e15bfb2fd150..f1d3ea21a5b8 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/postincrement.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/postincrement.pass.cpp @@ -30,6 +30,9 @@ TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "123"; test(bidirectional_iterator(s+1), bidirectional_iterator(s)); test(random_access_iterator(s+1), random_access_iterator(s)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s + 1), cpp20_random_access_iterator(s)); +#endif test(s+1, s); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/predecrement.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/predecrement.pass.cpp index 2fbd530a085d..5a2ac7857036 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/predecrement.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/predecrement.pass.cpp @@ -30,6 +30,9 @@ TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "123"; test(bidirectional_iterator(s+1), bidirectional_iterator(s+2)); test(random_access_iterator(s+1), random_access_iterator(s+2)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s + 1), cpp20_random_access_iterator(s + 2)); +#endif test(s+1, s+2); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/preincrement.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/preincrement.pass.cpp index 5efc8a39e22a..6087eedd2449 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/preincrement.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nav/preincrement.pass.cpp @@ -30,6 +30,9 @@ TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "123"; test(bidirectional_iterator(s+1), bidirectional_iterator(s)); test(random_access_iterator(s+1), random_access_iterator(s)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator(s + 1), cpp20_random_access_iterator(s)); +#endif test(s+1, s); return true; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/make_reverse_iterator.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/make_reverse_iterator.pass.cpp index 401eecb2a3b8..4a4e474a5508 100644 --- 
a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/make_reverse_iterator.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/make_reverse_iterator.pass.cpp @@ -22,24 +22,34 @@ #include "test_iterators.h" template -TEST_CONSTEXPR_CXX17 void test(It i) { - const std::reverse_iterator r = std::make_reverse_iterator(i); - assert(r.base() == i); +TEST_CONSTEXPR_CXX17 void test_one(It i) { + const std::reverse_iterator r = std::make_reverse_iterator(i); + assert(r.base() == i); +} + +template +TEST_CONSTEXPR_CXX17 void test() { + const char* s = "1234567890"; + It b(s); + It e(s + 10); + while (b != e) + test_one(b++); } TEST_CONSTEXPR_CXX17 bool tests() { - const char* s = "1234567890"; - random_access_iterator b(s); - random_access_iterator e(s+10); - while (b != e) - test (b++); - return true; + test(); + test>(); + test>(); +#if TEST_STD_VER >= 20 + test>(); +#endif + return true; } int main(int, char**) { - tests(); + tests(); #if TEST_STD_VER > 14 - static_assert(tests(), ""); + static_assert(tests(), ""); #endif - return 0; + return 0; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/minus.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/minus.pass.cpp index f7f74d145d73..676f6e1b4916 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/minus.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/minus.pass.cpp @@ -23,45 +23,63 @@ #include "test_macros.h" #include "test_iterators.h" -template struct HasMinus : std::false_type {}; -template struct HasMinus : std::true_type {}; +template +struct HasMinus : std::false_type {}; +template +struct HasMinus : std::true_type {}; + +// Test non-subtractable base iterator types +static_assert(HasMinus, std::reverse_iterator >::value, ""); +static_assert(HasMinus, std::reverse_iterator >::value, ""); + +#if TEST_STD_VER >= 11 +static_assert(!HasMinus, std::reverse_iterator >::value, ""); +static_assert(!HasMinus >, + std::reverse_iterator > >::value, + ""); +#endif template -TEST_CONSTEXPR_CXX17 void test(It1 l, It2 r, std::ptrdiff_t x) { - const std::reverse_iterator r1(l); - const std::reverse_iterator r2(r); - assert((r1 - r2) == x); +TEST_CONSTEXPR_CXX17 void test_one(It1 l, It2 r, std::ptrdiff_t x) { + const std::reverse_iterator r1(l); + const std::reverse_iterator r2(r); + assert((r1 - r2) == x); } -TEST_CONSTEXPR_CXX17 bool tests() { - using PC = const char*; - char s[3] = {0}; - - // Test same base iterator type - test(s, s, 0); - test(s, s+1, 1); - test(s+1, s, -1); +template +TEST_CONSTEXPR_CXX17 void test() { + // Test same base iterator type + char s[3] = {0}; - // Test different (but subtractable) base iterator types - test(PC(s), s, 0); - test(PC(s), s+1, 1); - test(PC(s+1), s, -1); + test_one(Iter(s), Iter(s), 0); + test_one(Iter(s), Iter(s + 1), 1); + test_one(Iter(s + 1), Iter(s), -1); +} - // Test non-subtractable base iterator types - static_assert( HasMinus, std::reverse_iterator >::value, ""); - static_assert( HasMinus, std::reverse_iterator >::value, ""); -#if TEST_STD_VER >= 11 - static_assert(!HasMinus, std::reverse_iterator >::value, ""); - static_assert(!HasMinus >, std::reverse_iterator > >::value, ""); +TEST_CONSTEXPR_CXX17 bool tests() { + { + test(); + test >(); +#if TEST_STD_VER >= 20 + test>(); #endif + } + { + // Test different (but subtractable) base 
iterator types + using PC = const char*; + char s[3] = {0}; + test_one(PC(s), s, 0); + test_one(PC(s), s + 1, 1); + test_one(PC(s + 1), s, -1); + } - return true; + return true; } int main(int, char**) { - tests(); + tests(); #if TEST_STD_VER > 14 - static_assert(tests(), ""); + static_assert(tests(), ""); #endif - return 0; + return 0; } diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/plus.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/plus.pass.cpp index aeb9f89dd487..9ead123781bc 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/plus.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.nonmember/plus.pass.cpp @@ -29,6 +29,9 @@ TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::differen TEST_CONSTEXPR_CXX17 bool tests() { const char* s = "1234567890"; test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s)); +#if TEST_STD_VER >= 20 + test(cpp20_random_access_iterator<const char*>(s + 5), 5, cpp20_random_access_iterator<const char*>(s)); +#endif test(s+5, 5, s); return true; } -- GitLab From 1775b98de719299b653c12999d49ca04a9f4f65b Mon Sep 17 00:00:00 2001 From: Finlay Date: Sat, 19 Oct 2024 11:55:04 +0100 Subject: [PATCH 153/511] [mlir][spirv] Add spirv-to-llvm conversion for OpControlBarrier (#111864) The conversion is based on the LLVM function that the LLVM/SPIR-V translation tool is expected to emit; see the mangling note after the tablegen changes below. --- .../mlir/Dialect/SPIRV/IR/SPIRVBarrierOps.td | 2 +- .../mlir/Dialect/SPIRV/IR/SPIRVMiscOps.td | 2 +- .../Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp | 70 ++++++++++++++++++- .../SPIRVToLLVM/barrier-ops-to-llvm.mlir | 23 ++++++ 4 files changed, 94 insertions(+), 3 deletions(-) create mode 100644 mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBarrierOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBarrierOps.td index 1ebea94fced0..145933054906 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBarrierOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBarrierOps.td @@ -54,7 +54,7 @@ def SPIRV_ControlBarrierOp : SPIRV_Op<"ControlBarrier", []> { #### Example: ```mlir - spirv.ControlBarrier "Workgroup", "Device", "Acquire|UniformMemory" + spirv.ControlBarrier <Workgroup>, <Device>, <Acquire|UniformMemory> ``` }]; diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMiscOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMiscOps.td index 71ecabfb444b..022cbbbb6720 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMiscOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMiscOps.td @@ -1,4 +1,4 @@ -//===-- SPIRVBarrierOps.td - MLIR SPIR-V Barrier Ops -------*- tablegen -*-===// +//===-- SPIRVMiscOps.td - MLIR SPIR-V Misc Ops -------------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information.
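The builtin name used by the conversion pattern that follows, `_Z22__spirv_ControlBarrieriii`, is just the Itanium mangling of a three-`int` function. As a sketch for orientation (the parameter names here are illustrative, not from the patch):

```cpp
// _Z22__spirv_ControlBarrieriii decodes as:
//   _Z                        -- Itanium-mangled name follows
//   22__spirv_ControlBarrier  -- 22-character identifier
//   iii                       -- three `int` parameters
// So the llvm.func the pattern declares is equivalent to:
void __spirv_ControlBarrier(int execution_scope, int memory_scope,
                            int memory_semantics);
```

The pattern materializes the three arguments as `i32` constants taken from the op's execution-scope, memory-scope, and memory-semantics attributes, as the code below shows.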
diff --git a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp index 74c169c9a7e7..f28473a108e1 100644 --- a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp +++ b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp @@ -1024,6 +1024,71 @@ public: } }; +static LLVM::LLVMFuncOp lookupOrCreateSPIRVFn(Operation *symbolTable, + StringRef name, + ArrayRef<Type> paramTypes, + Type resultType) { + auto func = dyn_cast_or_null<LLVM::LLVMFuncOp>( + SymbolTable::lookupSymbolIn(symbolTable, name)); + if (func) + return func; + + OpBuilder b(symbolTable->getRegion(0)); + func = b.create<LLVM::LLVMFuncOp>( + symbolTable->getLoc(), name, + LLVM::LLVMFunctionType::get(resultType, paramTypes)); + func.setCConv(LLVM::cconv::CConv::SPIR_FUNC); + func.setConvergent(true); + func.setNoUnwind(true); + func.setWillReturn(true); + return func; +} + +static LLVM::CallOp createSPIRVBuiltinCall(Location loc, OpBuilder &builder, + LLVM::LLVMFuncOp func, + ValueRange args) { + auto call = builder.create<LLVM::CallOp>(loc, func, args); + call.setCConv(func.getCConv()); + call.setConvergentAttr(func.getConvergentAttr()); + call.setNoUnwindAttr(func.getNoUnwindAttr()); + call.setWillReturnAttr(func.getWillReturnAttr()); + return call; +} + +class ControlBarrierPattern + : public SPIRVToLLVMConversion<spirv::ControlBarrierOp> { +public: + using SPIRVToLLVMConversion<spirv::ControlBarrierOp>::SPIRVToLLVMConversion; + + LogicalResult + matchAndRewrite(spirv::ControlBarrierOp controlBarrierOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + constexpr StringLiteral funcName = "_Z22__spirv_ControlBarrieriii"; + Operation *symbolTable = + controlBarrierOp->getParentWithTrait<OpTrait::SymbolTable>(); + + Type i32 = rewriter.getI32Type(); + + Type voidTy = rewriter.getType<LLVM::LLVMVoidType>(); + LLVM::LLVMFuncOp func = + lookupOrCreateSPIRVFn(symbolTable, funcName, {i32, i32, i32}, voidTy); + + Location loc = controlBarrierOp->getLoc(); + Value execution = rewriter.create<LLVM::ConstantOp>( + loc, i32, static_cast<int32_t>(adaptor.getExecutionScope())); + Value memory = rewriter.create<LLVM::ConstantOp>( + loc, i32, static_cast<int32_t>(adaptor.getMemoryScope())); + Value semantics = rewriter.create<LLVM::ConstantOp>( + loc, i32, static_cast<int32_t>(adaptor.getMemorySemantics())); + + auto call = createSPIRVBuiltinCall(loc, rewriter, func, + {execution, memory, semantics}); + + rewriter.replaceOp(controlBarrierOp, call); + return success(); + } +}; + /// Converts `spirv.mlir.loop` to LLVM dialect. All blocks within selection /// should be reachable for conversion to succeed.
The structure of the loop in /// LLVM dialect will be the following: @@ -1648,7 +1713,10 @@ void mlir::populateSPIRVToLLVMConversionPatterns( ShiftPattern, // Return ops - ReturnPattern, ReturnValuePattern>(patterns.getContext(), typeConverter); + ReturnPattern, ReturnValuePattern, + + // Barrier ops + ControlBarrierPattern>(patterns.getContext(), typeConverter); patterns.add<GlobalVariablePattern>(clientAPI, patterns.getContext(), typeConverter); diff --git a/mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir new file mode 100644 index 000000000000..d53afeeea15d --- /dev/null +++ b/mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir @@ -0,0 +1,23 @@ +// RUN: mlir-opt -convert-spirv-to-llvm %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// spirv.ControlBarrierOp +//===----------------------------------------------------------------------===// + +// CHECK: llvm.func spir_funccc @_Z22__spirv_ControlBarrieriii(i32, i32, i32) attributes {convergent, no_unwind, will_return} + +// CHECK-LABEL: @control_barrier +spirv.func @control_barrier() "None" { + // CHECK: [[EXECUTION:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[MEMORY:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[SEMANTICS:%.*]] = llvm.mlir.constant(768 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[EXECUTION]], [[MEMORY]], [[SEMANTICS]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () + spirv.ControlBarrier <Workgroup>, <Workgroup>, <CrossWorkgroupMemory|WorkgroupMemory> + + // CHECK: [[EXECUTION:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[MEMORY:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[SEMANTICS:%.*]] = llvm.mlir.constant(256 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[EXECUTION]], [[MEMORY]], [[SEMANTICS]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () + spirv.ControlBarrier <Workgroup>, <Workgroup>, <WorkgroupMemory> + spirv.Return +} -- GitLab From 8fe49b0bbef5134c87adc2719165392fca1865c3 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Sat, 19 Oct 2024 20:10:15 +0800 Subject: [PATCH 154/511] [mlir][docs] Fix name of `mlir-linalg-ods-yaml-gen` (NFC) (#113029) --- mlir/docs/Dialects/Linalg/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/docs/Dialects/Linalg/_index.md b/mlir/docs/Dialects/Linalg/_index.md index fbd1a451dc09..976f0fd3c7e9 100644 --- a/mlir/docs/Dialects/Linalg/_index.md +++ b/mlir/docs/Dialects/Linalg/_index.md @@ -667,7 +667,7 @@ directly. This facility is currently in flight and is intended to subsume the above when ready. See the C++ class to YAML mapping traits in -`mlir-mlinalg-ods-yaml-gen.cpp` as the source of truth for the schema. +`mlir-linalg-ods-yaml-gen.cpp` as the source of truth for the schema. Most of the above documentation roughly applies to this path and will be ported as migration continues. -- GitLab From 5785cbb40570c3847aa994b2d2b7e03321eee7eb Mon Sep 17 00:00:00 2001 From: Alex Rønne Petersen Date: Sat, 19 Oct 2024 15:13:15 +0200 Subject: [PATCH 155/511] [llvm] Ensure that soft float targets don't emit `fma()` libcalls. (#106615) The previous behavior could be harmful in some edge cases, such as emitting a call to `fma()` in the `fma()` implementation itself. This is fixed by simply being more accurate in `isFMAFasterThanFMulAndFAdd()`. This was already done for PowerPC; this commit just extends that to Arm, z/Arch, and x86 (a short sketch of the recursion hazard follows below).
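As a sketch of that hazard: suppose a soft-float runtime's own `fma()` is compiled with the `use-soft-float` attribute (the function body below is illustrative, not a correctly fused implementation):

```cpp
// Hypothetical soft-float libm source, built with "use-soft-float".
// Under clang's default -ffp-contract=on, x * y + z is emitted as
// llvm.fmuladd, and lowering llvm.fmuladd consults
// isFMAFasterThanFMulAndFAdd().
extern "C" double fma(double x, double y, double z) {
  // Old behavior: the hook could still return true, the backend formed an
  // FMA node, and soft-float legalization expanded that node into a call
  // to fma() -- unbounded recursion into this very function.
  // New behavior: the hook returns false under use-soft-float, so this
  // lowers to __muldf3/__adddf3 libcalls, as the new tests check.
  return x * y + z; // illustrative only; not a correctly rounded fused fma
}
```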
MIPS and SPARC already got it right, but I added tests for them too, for good measure. Note: I don't have commit access. --- llvm/include/llvm/CodeGen/TargetLowering.h | 3 + llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 + .../Target/SystemZ/SystemZISelLowering.cpp | 3 + llvm/lib/Target/X86/X86ISelLowering.cpp | 3 + llvm/test/CodeGen/ARM/fmuladd-soft-float.ll | 406 ++++ llvm/test/CodeGen/Mips/fmuladd-soft-float.ll | 932 +++++++++ llvm/test/CodeGen/SPARC/fmuladd-soft-float.ll | 385 ++++ .../CodeGen/SystemZ/fmuladd-soft-float.ll | 230 +++ llvm/test/CodeGen/X86/fmuladd-soft-float.ll | 1777 +++++++++++++++++ 9 files changed, 3742 insertions(+) create mode 100644 llvm/test/CodeGen/ARM/fmuladd-soft-float.ll create mode 100644 llvm/test/CodeGen/Mips/fmuladd-soft-float.ll create mode 100644 llvm/test/CodeGen/SPARC/fmuladd-soft-float.ll create mode 100644 llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll create mode 100644 llvm/test/CodeGen/X86/fmuladd-soft-float.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 61615cb0f7b3..8e0cdc6f1a5e 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3223,6 +3223,9 @@ public: /// not legal, but should return true if those types will eventually legalize /// to types that support FMAs. After legalization, it will only be called on /// types that support FMAs (via Legal or Custom actions) + /// + /// Targets that care about soft float support should return false when soft + /// float code is being generated (i.e. use-soft-float). virtual bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT) const { return false; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 5d679a1a916d..a4f01e55f53c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -19354,6 +19354,9 @@ bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { /// patterns (and we don't have the non-fused floating point instruction). 
bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { + if (Subtarget->useSoftFloat()) + return false; + if (!VT.isSimple()) return false; diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 83417e570dab..3e05f3b0180a 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -793,6 +793,9 @@ EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL, bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd( const MachineFunction &MF, EVT VT) const { + if (useSoftFloat()) + return false; + VT = VT.getScalarType(); if (!VT.isSimple()) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9d143256de1e..bcb84add65d8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34838,6 +34838,9 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { + if (Subtarget.useSoftFloat()) + return false; + if (!Subtarget.hasAnyFMA()) return false; diff --git a/llvm/test/CodeGen/ARM/fmuladd-soft-float.ll b/llvm/test/CodeGen/ARM/fmuladd-soft-float.ll new file mode 100644 index 000000000000..88c31325b64b --- /dev/null +++ b/llvm/test/CodeGen/ARM/fmuladd-soft-float.ll @@ -0,0 +1,406 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=arm < %s | FileCheck %s -check-prefix=SOFT-FLOAT +; RUN: llc -mtriple=arm -mattr=+vfp4d16sp < %s | FileCheck %s -check-prefix=SOFT-FLOAT-VFP32 +; RUN: llc -mtriple=arm -mattr=+vfp4d16sp,+fp64 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-VFP64 + +define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT: @ %bb.0: +; SOFT-FLOAT-NEXT: push {r4, lr} +; SOFT-FLOAT-NEXT: mov r4, r2 +; SOFT-FLOAT-NEXT: bl __mulsf3 +; SOFT-FLOAT-NEXT: mov r1, r4 +; SOFT-FLOAT-NEXT: bl __addsf3 +; SOFT-FLOAT-NEXT: pop {r4, lr} +; SOFT-FLOAT-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP32-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-VFP32: @ %bb.0: +; SOFT-FLOAT-VFP32-NEXT: push {r4, lr} +; SOFT-FLOAT-VFP32-NEXT: mov r4, r2 +; SOFT-FLOAT-VFP32-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP32-NEXT: mov r1, r4 +; SOFT-FLOAT-VFP32-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP32-NEXT: pop {r4, lr} +; SOFT-FLOAT-VFP32-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP64-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-VFP64: @ %bb.0: +; SOFT-FLOAT-VFP64-NEXT: push {r4, lr} +; SOFT-FLOAT-VFP64-NEXT: mov r4, r2 +; SOFT-FLOAT-VFP64-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP64-NEXT: mov r1, r4 +; SOFT-FLOAT-VFP64-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP64-NEXT: pop {r4, lr} +; SOFT-FLOAT-VFP64-NEXT: mov pc, lr + %result = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + ret float %result +} + +define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT: @ %bb.0: +; SOFT-FLOAT-NEXT: push {r11, lr} +; SOFT-FLOAT-NEXT: bl __muldf3 +; SOFT-FLOAT-NEXT: ldr r2, [sp, #8] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #12] +; SOFT-FLOAT-NEXT: bl __adddf3 +; SOFT-FLOAT-NEXT: pop {r11, lr} +; SOFT-FLOAT-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP32-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-VFP32: @ %bb.0: +; SOFT-FLOAT-VFP32-NEXT: push {r11, lr} +; SOFT-FLOAT-VFP32-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP32-NEXT: 
ldr r2, [sp, #8] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #12] +; SOFT-FLOAT-VFP32-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP32-NEXT: pop {r11, lr} +; SOFT-FLOAT-VFP32-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP64-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-VFP64: @ %bb.0: +; SOFT-FLOAT-VFP64-NEXT: push {r11, lr} +; SOFT-FLOAT-VFP64-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #8] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #12] +; SOFT-FLOAT-VFP64-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP64-NEXT: pop {r11, lr} +; SOFT-FLOAT-VFP64-NEXT: mov pc, lr + %result = call double @llvm.fmuladd.f64(double %a, double %b, double %c) + ret double %result +} + +define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT: @ %bb.0: +; SOFT-FLOAT-NEXT: push {r4, lr} +; SOFT-FLOAT-NEXT: mov r4, r2 +; SOFT-FLOAT-NEXT: bl __mulsf3 +; SOFT-FLOAT-NEXT: mov r1, r4 +; SOFT-FLOAT-NEXT: bl __addsf3 +; SOFT-FLOAT-NEXT: pop {r4, lr} +; SOFT-FLOAT-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP32-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-VFP32: @ %bb.0: +; SOFT-FLOAT-VFP32-NEXT: push {r4, lr} +; SOFT-FLOAT-VFP32-NEXT: mov r4, r2 +; SOFT-FLOAT-VFP32-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP32-NEXT: mov r1, r4 +; SOFT-FLOAT-VFP32-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP32-NEXT: pop {r4, lr} +; SOFT-FLOAT-VFP32-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP64-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-VFP64: @ %bb.0: +; SOFT-FLOAT-VFP64-NEXT: push {r4, lr} +; SOFT-FLOAT-VFP64-NEXT: mov r4, r2 +; SOFT-FLOAT-VFP64-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP64-NEXT: mov r1, r4 +; SOFT-FLOAT-VFP64-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP64-NEXT: pop {r4, lr} +; SOFT-FLOAT-VFP64-NEXT: mov pc, lr + %product = fmul contract float %a, %b + %result = fadd contract float %product, %c + ret float %result +} + +define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT: @ %bb.0: +; SOFT-FLOAT-NEXT: push {r11, lr} +; SOFT-FLOAT-NEXT: bl __muldf3 +; SOFT-FLOAT-NEXT: ldr r2, [sp, #8] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #12] +; SOFT-FLOAT-NEXT: bl __adddf3 +; SOFT-FLOAT-NEXT: pop {r11, lr} +; SOFT-FLOAT-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP32-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-VFP32: @ %bb.0: +; SOFT-FLOAT-VFP32-NEXT: push {r11, lr} +; SOFT-FLOAT-VFP32-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #8] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #12] +; SOFT-FLOAT-VFP32-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP32-NEXT: pop {r11, lr} +; SOFT-FLOAT-VFP32-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP64-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-VFP64: @ %bb.0: +; SOFT-FLOAT-VFP64-NEXT: push {r11, lr} +; SOFT-FLOAT-VFP64-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #8] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #12] +; SOFT-FLOAT-VFP64-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP64-NEXT: pop {r11, lr} +; SOFT-FLOAT-VFP64-NEXT: mov pc, lr + %product = fmul contract double %a, %b + %result = fadd contract double %product, %c + ret double %result +} + +define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT: @ %bb.0: +; SOFT-FLOAT-NEXT: push {r4, r5, r6, r7, r11, lr} +; SOFT-FLOAT-NEXT: mov r7, r1 +; SOFT-FLOAT-NEXT: ldr r1, [sp, #24] +; SOFT-FLOAT-NEXT: mov r4, r3 +; SOFT-FLOAT-NEXT: mov r6, r2 +; SOFT-FLOAT-NEXT: bl __mulsf3 +; SOFT-FLOAT-NEXT: ldr r1, [sp, #40] +; SOFT-FLOAT-NEXT: bl __addsf3 +; SOFT-FLOAT-NEXT: ldr r1, [sp, #28] +; SOFT-FLOAT-NEXT: mov r5, r0 +; 
SOFT-FLOAT-NEXT: mov r0, r7 +; SOFT-FLOAT-NEXT: bl __mulsf3 +; SOFT-FLOAT-NEXT: ldr r1, [sp, #44] +; SOFT-FLOAT-NEXT: bl __addsf3 +; SOFT-FLOAT-NEXT: ldr r1, [sp, #32] +; SOFT-FLOAT-NEXT: mov r7, r0 +; SOFT-FLOAT-NEXT: mov r0, r6 +; SOFT-FLOAT-NEXT: bl __mulsf3 +; SOFT-FLOAT-NEXT: ldr r1, [sp, #48] +; SOFT-FLOAT-NEXT: bl __addsf3 +; SOFT-FLOAT-NEXT: ldr r1, [sp, #36] +; SOFT-FLOAT-NEXT: mov r6, r0 +; SOFT-FLOAT-NEXT: mov r0, r4 +; SOFT-FLOAT-NEXT: bl __mulsf3 +; SOFT-FLOAT-NEXT: ldr r1, [sp, #52] +; SOFT-FLOAT-NEXT: bl __addsf3 +; SOFT-FLOAT-NEXT: mov r3, r0 +; SOFT-FLOAT-NEXT: mov r0, r5 +; SOFT-FLOAT-NEXT: mov r1, r7 +; SOFT-FLOAT-NEXT: mov r2, r6 +; SOFT-FLOAT-NEXT: pop {r4, r5, r6, r7, r11, lr} +; SOFT-FLOAT-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP32-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-VFP32: @ %bb.0: +; SOFT-FLOAT-VFP32-NEXT: push {r4, r5, r6, r7, r11, lr} +; SOFT-FLOAT-VFP32-NEXT: mov r7, r1 +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #24] +; SOFT-FLOAT-VFP32-NEXT: mov r4, r3 +; SOFT-FLOAT-VFP32-NEXT: mov r6, r2 +; SOFT-FLOAT-VFP32-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #40] +; SOFT-FLOAT-VFP32-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #28] +; SOFT-FLOAT-VFP32-NEXT: mov r5, r0 +; SOFT-FLOAT-VFP32-NEXT: mov r0, r7 +; SOFT-FLOAT-VFP32-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #44] +; SOFT-FLOAT-VFP32-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #32] +; SOFT-FLOAT-VFP32-NEXT: mov r7, r0 +; SOFT-FLOAT-VFP32-NEXT: mov r0, r6 +; SOFT-FLOAT-VFP32-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #48] +; SOFT-FLOAT-VFP32-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #36] +; SOFT-FLOAT-VFP32-NEXT: mov r6, r0 +; SOFT-FLOAT-VFP32-NEXT: mov r0, r4 +; SOFT-FLOAT-VFP32-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #52] +; SOFT-FLOAT-VFP32-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP32-NEXT: mov r3, r0 +; SOFT-FLOAT-VFP32-NEXT: mov r0, r5 +; SOFT-FLOAT-VFP32-NEXT: mov r1, r7 +; SOFT-FLOAT-VFP32-NEXT: mov r2, r6 +; SOFT-FLOAT-VFP32-NEXT: pop {r4, r5, r6, r7, r11, lr} +; SOFT-FLOAT-VFP32-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP64-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-VFP64: @ %bb.0: +; SOFT-FLOAT-VFP64-NEXT: push {r4, r5, r6, r7, r11, lr} +; SOFT-FLOAT-VFP64-NEXT: mov r7, r1 +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #24] +; SOFT-FLOAT-VFP64-NEXT: mov r4, r3 +; SOFT-FLOAT-VFP64-NEXT: mov r6, r2 +; SOFT-FLOAT-VFP64-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #40] +; SOFT-FLOAT-VFP64-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #28] +; SOFT-FLOAT-VFP64-NEXT: mov r5, r0 +; SOFT-FLOAT-VFP64-NEXT: mov r0, r7 +; SOFT-FLOAT-VFP64-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #44] +; SOFT-FLOAT-VFP64-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #32] +; SOFT-FLOAT-VFP64-NEXT: mov r7, r0 +; SOFT-FLOAT-VFP64-NEXT: mov r0, r6 +; SOFT-FLOAT-VFP64-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #48] +; SOFT-FLOAT-VFP64-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #36] +; SOFT-FLOAT-VFP64-NEXT: mov r6, r0 +; SOFT-FLOAT-VFP64-NEXT: mov r0, r4 +; SOFT-FLOAT-VFP64-NEXT: bl __mulsf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #52] +; SOFT-FLOAT-VFP64-NEXT: bl __addsf3 +; SOFT-FLOAT-VFP64-NEXT: mov r3, r0 +; SOFT-FLOAT-VFP64-NEXT: mov r0, r5 +; SOFT-FLOAT-VFP64-NEXT: mov r1, r7 +; SOFT-FLOAT-VFP64-NEXT: mov r2, r6 +; SOFT-FLOAT-VFP64-NEXT: pop {r4, r5, r6, r7, r11, lr} +; SOFT-FLOAT-VFP64-NEXT: mov pc, lr + %product = fmul contract <4 x float> %a, %b + %result = fadd contract 
<4 x float> %product, %c + ret <4 x float> %result +} + +define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT: @ %bb.0: +; SOFT-FLOAT-NEXT: push {r4, r5, r6, lr} +; SOFT-FLOAT-NEXT: mov r5, r3 +; SOFT-FLOAT-NEXT: mov r6, r2 +; SOFT-FLOAT-NEXT: mov r4, r0 +; SOFT-FLOAT-NEXT: ldr r0, [sp, #32] +; SOFT-FLOAT-NEXT: ldr r1, [sp, #36] +; SOFT-FLOAT-NEXT: ldr r2, [sp, #64] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #68] +; SOFT-FLOAT-NEXT: bl __muldf3 +; SOFT-FLOAT-NEXT: ldr r2, [sp, #96] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #100] +; SOFT-FLOAT-NEXT: bl __adddf3 +; SOFT-FLOAT-NEXT: str r0, [r4, #24] +; SOFT-FLOAT-NEXT: str r1, [r4, #28] +; SOFT-FLOAT-NEXT: ldr r0, [sp, #24] +; SOFT-FLOAT-NEXT: ldr r1, [sp, #28] +; SOFT-FLOAT-NEXT: ldr r2, [sp, #56] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #60] +; SOFT-FLOAT-NEXT: bl __muldf3 +; SOFT-FLOAT-NEXT: ldr r2, [sp, #88] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #92] +; SOFT-FLOAT-NEXT: bl __adddf3 +; SOFT-FLOAT-NEXT: str r0, [r4, #16] +; SOFT-FLOAT-NEXT: str r1, [r4, #20] +; SOFT-FLOAT-NEXT: ldr r0, [sp, #16] +; SOFT-FLOAT-NEXT: ldr r1, [sp, #20] +; SOFT-FLOAT-NEXT: ldr r2, [sp, #48] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #52] +; SOFT-FLOAT-NEXT: bl __muldf3 +; SOFT-FLOAT-NEXT: ldr r2, [sp, #80] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #84] +; SOFT-FLOAT-NEXT: bl __adddf3 +; SOFT-FLOAT-NEXT: ldr r2, [sp, #40] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #44] +; SOFT-FLOAT-NEXT: str r0, [r4, #8] +; SOFT-FLOAT-NEXT: mov r0, r6 +; SOFT-FLOAT-NEXT: str r1, [r4, #12] +; SOFT-FLOAT-NEXT: mov r1, r5 +; SOFT-FLOAT-NEXT: bl __muldf3 +; SOFT-FLOAT-NEXT: ldr r2, [sp, #72] +; SOFT-FLOAT-NEXT: ldr r3, [sp, #76] +; SOFT-FLOAT-NEXT: bl __adddf3 +; SOFT-FLOAT-NEXT: stm r4, {r0, r1} +; SOFT-FLOAT-NEXT: pop {r4, r5, r6, lr} +; SOFT-FLOAT-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP32-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-VFP32: @ %bb.0: +; SOFT-FLOAT-VFP32-NEXT: push {r4, r5, r6, lr} +; SOFT-FLOAT-VFP32-NEXT: mov r5, r3 +; SOFT-FLOAT-VFP32-NEXT: mov r6, r2 +; SOFT-FLOAT-VFP32-NEXT: mov r4, r0 +; SOFT-FLOAT-VFP32-NEXT: ldr r0, [sp, #32] +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #36] +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #64] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #68] +; SOFT-FLOAT-VFP32-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #96] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #100] +; SOFT-FLOAT-VFP32-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP32-NEXT: str r0, [r4, #24] +; SOFT-FLOAT-VFP32-NEXT: str r1, [r4, #28] +; SOFT-FLOAT-VFP32-NEXT: ldr r0, [sp, #24] +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #28] +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #56] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #60] +; SOFT-FLOAT-VFP32-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #88] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #92] +; SOFT-FLOAT-VFP32-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP32-NEXT: str r0, [r4, #16] +; SOFT-FLOAT-VFP32-NEXT: str r1, [r4, #20] +; SOFT-FLOAT-VFP32-NEXT: ldr r0, [sp, #16] +; SOFT-FLOAT-VFP32-NEXT: ldr r1, [sp, #20] +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #48] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #52] +; SOFT-FLOAT-VFP32-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #80] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #84] +; SOFT-FLOAT-VFP32-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #40] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #44] +; SOFT-FLOAT-VFP32-NEXT: str r0, [r4, #8] +; SOFT-FLOAT-VFP32-NEXT: mov r0, r6 +; SOFT-FLOAT-VFP32-NEXT: str r1, [r4, #12] +; SOFT-FLOAT-VFP32-NEXT: mov r1, r5 +; 
SOFT-FLOAT-VFP32-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP32-NEXT: ldr r2, [sp, #72] +; SOFT-FLOAT-VFP32-NEXT: ldr r3, [sp, #76] +; SOFT-FLOAT-VFP32-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP32-NEXT: stm r4, {r0, r1} +; SOFT-FLOAT-VFP32-NEXT: pop {r4, r5, r6, lr} +; SOFT-FLOAT-VFP32-NEXT: mov pc, lr +; +; SOFT-FLOAT-VFP64-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-VFP64: @ %bb.0: +; SOFT-FLOAT-VFP64-NEXT: push {r4, r5, r6, lr} +; SOFT-FLOAT-VFP64-NEXT: mov r5, r3 +; SOFT-FLOAT-VFP64-NEXT: mov r6, r2 +; SOFT-FLOAT-VFP64-NEXT: mov r4, r0 +; SOFT-FLOAT-VFP64-NEXT: ldr r0, [sp, #32] +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #36] +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #64] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #68] +; SOFT-FLOAT-VFP64-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #96] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #100] +; SOFT-FLOAT-VFP64-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP64-NEXT: str r0, [r4, #24] +; SOFT-FLOAT-VFP64-NEXT: str r1, [r4, #28] +; SOFT-FLOAT-VFP64-NEXT: ldr r0, [sp, #24] +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #28] +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #56] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #60] +; SOFT-FLOAT-VFP64-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #88] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #92] +; SOFT-FLOAT-VFP64-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP64-NEXT: str r0, [r4, #16] +; SOFT-FLOAT-VFP64-NEXT: str r1, [r4, #20] +; SOFT-FLOAT-VFP64-NEXT: ldr r0, [sp, #16] +; SOFT-FLOAT-VFP64-NEXT: ldr r1, [sp, #20] +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #48] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #52] +; SOFT-FLOAT-VFP64-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #80] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #84] +; SOFT-FLOAT-VFP64-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #40] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #44] +; SOFT-FLOAT-VFP64-NEXT: str r0, [r4, #8] +; SOFT-FLOAT-VFP64-NEXT: mov r0, r6 +; SOFT-FLOAT-VFP64-NEXT: str r1, [r4, #12] +; SOFT-FLOAT-VFP64-NEXT: mov r1, r5 +; SOFT-FLOAT-VFP64-NEXT: bl __muldf3 +; SOFT-FLOAT-VFP64-NEXT: ldr r2, [sp, #72] +; SOFT-FLOAT-VFP64-NEXT: ldr r3, [sp, #76] +; SOFT-FLOAT-VFP64-NEXT: bl __adddf3 +; SOFT-FLOAT-VFP64-NEXT: stm r4, {r0, r1} +; SOFT-FLOAT-VFP64-NEXT: pop {r4, r5, r6, lr} +; SOFT-FLOAT-VFP64-NEXT: mov pc, lr + %product = fmul contract <4 x double> %a, %b + %result = fadd contract <4 x double> %product, %c + ret <4 x double> %result +} + +attributes #0 = { "use-soft-float"="true" } + +declare float @llvm.fmuladd.f32(float %a, float %b, float %c) +declare double @llvm.fmuladd.f64(double %a, double %b, double %c) diff --git a/llvm/test/CodeGen/Mips/fmuladd-soft-float.ll b/llvm/test/CodeGen/Mips/fmuladd-soft-float.ll new file mode 100644 index 000000000000..bbfb7cf9ca90 --- /dev/null +++ b/llvm/test/CodeGen/Mips/fmuladd-soft-float.ll @@ -0,0 +1,932 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=mips < %s | FileCheck %s -check-prefix=SOFT-FLOAT-32 +; RUN: llc -mtriple=mips -mcpu mips32r2 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-32R2 +; RUN: llc -mtriple=mips64 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-64 +; RUN: llc -mtriple=mips64 -mcpu mips64r2 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-64R2 + +define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -24 +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; 
SOFT-FLOAT-32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32-NEXT: .cfi_offset 16, -8 +; SOFT-FLOAT-32-NEXT: jal __mulsf3 +; SOFT-FLOAT-32-NEXT: move $16, $6 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: jal __addsf3 +; SOFT-FLOAT-32-NEXT: move $5, $16 +; SOFT-FLOAT-32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: jr $ra +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 24 +; +; SOFT-FLOAT-32R2-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-32R2: # %bb.0: +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, -24 +; SOFT-FLOAT-32R2-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32R2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $16, 16($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 16, -8 +; SOFT-FLOAT-32R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-32R2-NEXT: move $16, $6 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: jal __addsf3 +; SOFT-FLOAT-32R2-NEXT: move $5, $16 +; SOFT-FLOAT-32R2-NEXT: lw $16, 16($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: jr $ra +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, 24 +; +; SOFT-FLOAT-64-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -16 +; SOFT-FLOAT-64-NEXT: move $16, $6 +; SOFT-FLOAT-64-NEXT: sll $4, $4, 0 +; SOFT-FLOAT-64-NEXT: jal __mulsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $5, 0 +; SOFT-FLOAT-64-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64-NEXT: jal __addsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: jr $ra +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16 +; +; SOFT-FLOAT-64R2-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-64R2: # %bb.0: +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -16 +; SOFT-FLOAT-64R2-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64R2-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -16 +; SOFT-FLOAT-64R2-NEXT: move $16, $6 +; SOFT-FLOAT-64R2-NEXT: sll $4, $4, 0 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $5, 0 +; SOFT-FLOAT-64R2-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64R2-NEXT: jal __addsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: jr $ra +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 16 + %result = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + ret float %result +} + +define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -24 +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32-NEXT: jal __muldf3 +; SOFT-FLOAT-32-NEXT: nop +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: lw $6, 
40($sp) +; SOFT-FLOAT-32-NEXT: lw $7, 44($sp) +; SOFT-FLOAT-32-NEXT: jal __adddf3 +; SOFT-FLOAT-32-NEXT: move $5, $3 +; SOFT-FLOAT-32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: jr $ra +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 24 +; +; SOFT-FLOAT-32R2-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-32R2: # %bb.0: +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, -24 +; SOFT-FLOAT-32R2-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32R2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32R2-NEXT: jal __muldf3 +; SOFT-FLOAT-32R2-NEXT: nop +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: lw $6, 40($sp) +; SOFT-FLOAT-32R2-NEXT: lw $7, 44($sp) +; SOFT-FLOAT-32R2-NEXT: jal __adddf3 +; SOFT-FLOAT-32R2-NEXT: move $5, $3 +; SOFT-FLOAT-32R2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: jr $ra +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, 24 +; +; SOFT-FLOAT-64-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -16 +; SOFT-FLOAT-64-NEXT: jal __muldf3 +; SOFT-FLOAT-64-NEXT: move $16, $6 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: jal __adddf3 +; SOFT-FLOAT-64-NEXT: move $5, $16 +; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: jr $ra +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16 +; +; SOFT-FLOAT-64R2-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-64R2: # %bb.0: +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -16 +; SOFT-FLOAT-64R2-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64R2-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -16 +; SOFT-FLOAT-64R2-NEXT: jal __muldf3 +; SOFT-FLOAT-64R2-NEXT: move $16, $6 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: jal __adddf3 +; SOFT-FLOAT-64R2-NEXT: move $5, $16 +; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: jr $ra +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 16 + %result = call double @llvm.fmuladd.f64(double %a, double %b, double %c) + ret double %result +} + +define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -24 +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32-NEXT: .cfi_offset 16, -8 +; SOFT-FLOAT-32-NEXT: jal __mulsf3 +; SOFT-FLOAT-32-NEXT: move $16, $6 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: jal __addsf3 +; SOFT-FLOAT-32-NEXT: move $5, $16 +; SOFT-FLOAT-32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: jr $ra +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 24 +; +; SOFT-FLOAT-32R2-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-32R2: # %bb.0: +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, -24 +; SOFT-FLOAT-32R2-NEXT: .cfi_def_cfa_offset 24 +; 
SOFT-FLOAT-32R2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $16, 16($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 16, -8 +; SOFT-FLOAT-32R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-32R2-NEXT: move $16, $6 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: jal __addsf3 +; SOFT-FLOAT-32R2-NEXT: move $5, $16 +; SOFT-FLOAT-32R2-NEXT: lw $16, 16($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: jr $ra +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, 24 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -16 +; SOFT-FLOAT-64-NEXT: move $16, $6 +; SOFT-FLOAT-64-NEXT: sll $4, $4, 0 +; SOFT-FLOAT-64-NEXT: jal __mulsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $5, 0 +; SOFT-FLOAT-64-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64-NEXT: jal __addsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: jr $ra +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16 +; +; SOFT-FLOAT-64R2-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-64R2: # %bb.0: +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -16 +; SOFT-FLOAT-64R2-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64R2-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -16 +; SOFT-FLOAT-64R2-NEXT: move $16, $6 +; SOFT-FLOAT-64R2-NEXT: sll $4, $4, 0 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $5, 0 +; SOFT-FLOAT-64R2-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64R2-NEXT: jal __addsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: jr $ra +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 16 + %product = fmul contract float %a, %b + %result = fadd contract float %product, %c + ret float %result +} + +define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -24 +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32-NEXT: jal __muldf3 +; SOFT-FLOAT-32-NEXT: nop +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: lw $6, 40($sp) +; SOFT-FLOAT-32-NEXT: lw $7, 44($sp) +; SOFT-FLOAT-32-NEXT: jal __adddf3 +; SOFT-FLOAT-32-NEXT: move $5, $3 +; SOFT-FLOAT-32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: jr $ra +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 24 +; +; SOFT-FLOAT-32R2-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-32R2: # %bb.0: +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, -24 +; SOFT-FLOAT-32R2-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32R2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32R2-NEXT: jal __muldf3 +; SOFT-FLOAT-32R2-NEXT: nop +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: lw $6, 40($sp) +; 
SOFT-FLOAT-32R2-NEXT: lw $7, 44($sp) +; SOFT-FLOAT-32R2-NEXT: jal __adddf3 +; SOFT-FLOAT-32R2-NEXT: move $5, $3 +; SOFT-FLOAT-32R2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: jr $ra +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, 24 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -16 +; SOFT-FLOAT-64-NEXT: jal __muldf3 +; SOFT-FLOAT-64-NEXT: move $16, $6 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: jal __adddf3 +; SOFT-FLOAT-64-NEXT: move $5, $16 +; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: jr $ra +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16 +; +; SOFT-FLOAT-64R2-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-64R2: # %bb.0: +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -16 +; SOFT-FLOAT-64R2-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64R2-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -16 +; SOFT-FLOAT-64R2-NEXT: jal __muldf3 +; SOFT-FLOAT-64R2-NEXT: move $16, $6 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: jal __adddf3 +; SOFT-FLOAT-64R2-NEXT: move $5, $16 +; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: jr $ra +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 16 + %product = fmul contract double %a, %b + %result = fadd contract double %product, %c + ret double %result +} + +define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -48 +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-32-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $21, 40($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $20, 36($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $19, 32($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $18, 28($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $17, 24($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32-NEXT: .cfi_offset 21, -8 +; SOFT-FLOAT-32-NEXT: .cfi_offset 20, -12 +; SOFT-FLOAT-32-NEXT: .cfi_offset 19, -16 +; SOFT-FLOAT-32-NEXT: .cfi_offset 18, -20 +; SOFT-FLOAT-32-NEXT: .cfi_offset 17, -24 +; SOFT-FLOAT-32-NEXT: .cfi_offset 16, -28 +; SOFT-FLOAT-32-NEXT: move $17, $7 +; SOFT-FLOAT-32-NEXT: move $16, $4 +; SOFT-FLOAT-32-NEXT: lw $4, 64($sp) +; SOFT-FLOAT-32-NEXT: lw $5, 80($sp) +; SOFT-FLOAT-32-NEXT: jal __mulsf3 +; SOFT-FLOAT-32-NEXT: move $18, $6 +; SOFT-FLOAT-32-NEXT: lw $5, 96($sp) +; SOFT-FLOAT-32-NEXT: jal __addsf3 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: lw $4, 68($sp) +; SOFT-FLOAT-32-NEXT: lw $5, 84($sp) +; SOFT-FLOAT-32-NEXT: jal __mulsf3 +; SOFT-FLOAT-32-NEXT: move $19, $2 +; SOFT-FLOAT-32-NEXT: lw $5, 100($sp) +; SOFT-FLOAT-32-NEXT: jal __addsf3 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: move $20, $2 +; SOFT-FLOAT-32-NEXT: lw $5, 76($sp) +; 
SOFT-FLOAT-32-NEXT: jal __mulsf3 +; SOFT-FLOAT-32-NEXT: move $4, $17 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: lw $17, 88($sp) +; SOFT-FLOAT-32-NEXT: lw $21, 72($sp) +; SOFT-FLOAT-32-NEXT: lw $5, 92($sp) +; SOFT-FLOAT-32-NEXT: sw $20, 12($16) +; SOFT-FLOAT-32-NEXT: jal __addsf3 +; SOFT-FLOAT-32-NEXT: sw $19, 8($16) +; SOFT-FLOAT-32-NEXT: sw $2, 4($16) +; SOFT-FLOAT-32-NEXT: move $4, $18 +; SOFT-FLOAT-32-NEXT: jal __mulsf3 +; SOFT-FLOAT-32-NEXT: move $5, $21 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: jal __addsf3 +; SOFT-FLOAT-32-NEXT: move $5, $17 +; SOFT-FLOAT-32-NEXT: sw $2, 0($16) +; SOFT-FLOAT-32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $17, 24($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $18, 28($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $19, 32($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $20, 36($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $21, 40($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: jr $ra +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 48 +; +; SOFT-FLOAT-32R2-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-32R2: # %bb.0: +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, -48 +; SOFT-FLOAT-32R2-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-32R2-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $21, 40($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $20, 36($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $19, 32($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $18, 28($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $17, 24($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $16, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 21, -8 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 20, -12 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 19, -16 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 18, -20 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 17, -24 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 16, -28 +; SOFT-FLOAT-32R2-NEXT: move $17, $7 +; SOFT-FLOAT-32R2-NEXT: move $16, $4 +; SOFT-FLOAT-32R2-NEXT: lw $4, 64($sp) +; SOFT-FLOAT-32R2-NEXT: lw $5, 80($sp) +; SOFT-FLOAT-32R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-32R2-NEXT: move $18, $6 +; SOFT-FLOAT-32R2-NEXT: lw $5, 96($sp) +; SOFT-FLOAT-32R2-NEXT: jal __addsf3 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: lw $4, 68($sp) +; SOFT-FLOAT-32R2-NEXT: lw $5, 84($sp) +; SOFT-FLOAT-32R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-32R2-NEXT: move $19, $2 +; SOFT-FLOAT-32R2-NEXT: lw $5, 100($sp) +; SOFT-FLOAT-32R2-NEXT: jal __addsf3 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: move $20, $2 +; SOFT-FLOAT-32R2-NEXT: lw $5, 76($sp) +; SOFT-FLOAT-32R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-32R2-NEXT: move $4, $17 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: lw $17, 88($sp) +; SOFT-FLOAT-32R2-NEXT: lw $21, 72($sp) +; SOFT-FLOAT-32R2-NEXT: lw $5, 92($sp) +; SOFT-FLOAT-32R2-NEXT: sw $20, 12($16) +; SOFT-FLOAT-32R2-NEXT: jal __addsf3 +; SOFT-FLOAT-32R2-NEXT: sw $19, 8($16) +; SOFT-FLOAT-32R2-NEXT: sw $2, 4($16) +; SOFT-FLOAT-32R2-NEXT: move $4, $18 +; SOFT-FLOAT-32R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-32R2-NEXT: move $5, $21 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: jal __addsf3 +; SOFT-FLOAT-32R2-NEXT: move $5, $17 +; SOFT-FLOAT-32R2-NEXT: sw $2, 0($16) +; SOFT-FLOAT-32R2-NEXT: lw $16, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $17, 24($sp) # 4-byte Folded Reload +; 
SOFT-FLOAT-32R2-NEXT: lw $18, 28($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $19, 32($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $20, 36($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $21, 40($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: jr $ra +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, 48 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -64 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64-NEXT: sd $ra, 56($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $22, 48($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $21, 40($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $20, 32($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $19, 24($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $18, 16($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $17, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64-NEXT: .cfi_offset 22, -16 +; SOFT-FLOAT-64-NEXT: .cfi_offset 21, -24 +; SOFT-FLOAT-64-NEXT: .cfi_offset 20, -32 +; SOFT-FLOAT-64-NEXT: .cfi_offset 19, -40 +; SOFT-FLOAT-64-NEXT: .cfi_offset 18, -48 +; SOFT-FLOAT-64-NEXT: .cfi_offset 17, -56 +; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -64 +; SOFT-FLOAT-64-NEXT: move $16, $9 +; SOFT-FLOAT-64-NEXT: move $17, $8 +; SOFT-FLOAT-64-NEXT: move $18, $7 +; SOFT-FLOAT-64-NEXT: move $19, $6 +; SOFT-FLOAT-64-NEXT: move $20, $5 +; SOFT-FLOAT-64-NEXT: move $21, $4 +; SOFT-FLOAT-64-NEXT: sll $4, $4, 0 +; SOFT-FLOAT-64-NEXT: jal __mulsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $6, 0 +; SOFT-FLOAT-64-NEXT: move $22, $2 +; SOFT-FLOAT-64-NEXT: dsra $4, $21, 32 +; SOFT-FLOAT-64-NEXT: jal __mulsf3 +; SOFT-FLOAT-64-NEXT: dsra $5, $19, 32 +; SOFT-FLOAT-64-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64-NEXT: jal __addsf3 +; SOFT-FLOAT-64-NEXT: dsra $5, $17, 32 +; SOFT-FLOAT-64-NEXT: # kill: def $v0 killed $v0 def $v0_64 +; SOFT-FLOAT-64-NEXT: sll $4, $22, 0 +; SOFT-FLOAT-64-NEXT: sll $5, $17, 0 +; SOFT-FLOAT-64-NEXT: jal __addsf3 +; SOFT-FLOAT-64-NEXT: dsll $17, $2, 32 +; SOFT-FLOAT-64-NEXT: dsll $1, $2, 32 +; SOFT-FLOAT-64-NEXT: dsrl $1, $1, 32 +; SOFT-FLOAT-64-NEXT: sll $4, $20, 0 +; SOFT-FLOAT-64-NEXT: sll $5, $18, 0 +; SOFT-FLOAT-64-NEXT: jal __mulsf3 +; SOFT-FLOAT-64-NEXT: or $17, $1, $17 +; SOFT-FLOAT-64-NEXT: move $19, $2 +; SOFT-FLOAT-64-NEXT: dsra $4, $20, 32 +; SOFT-FLOAT-64-NEXT: jal __mulsf3 +; SOFT-FLOAT-64-NEXT: dsra $5, $18, 32 +; SOFT-FLOAT-64-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64-NEXT: jal __addsf3 +; SOFT-FLOAT-64-NEXT: dsra $5, $16, 32 +; SOFT-FLOAT-64-NEXT: # kill: def $v0 killed $v0 def $v0_64 +; SOFT-FLOAT-64-NEXT: dsll $18, $2, 32 +; SOFT-FLOAT-64-NEXT: sll $4, $19, 0 +; SOFT-FLOAT-64-NEXT: jal __addsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64-NEXT: dsll $1, $2, 32 +; SOFT-FLOAT-64-NEXT: dsrl $1, $1, 32 +; SOFT-FLOAT-64-NEXT: or $3, $1, $18 +; SOFT-FLOAT-64-NEXT: move $2, $17 +; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $17, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $18, 16($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $19, 24($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $20, 32($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $21, 40($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $22, 48($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload +; 
SOFT-FLOAT-64-NEXT: jr $ra +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 64 +; +; SOFT-FLOAT-64R2-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-64R2: # %bb.0: +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -64 +; SOFT-FLOAT-64R2-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64R2-NEXT: sd $ra, 56($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $22, 48($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $21, 40($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $20, 32($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $19, 24($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $18, 16($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $17, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 22, -16 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 21, -24 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 20, -32 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 19, -40 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 18, -48 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 17, -56 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -64 +; SOFT-FLOAT-64R2-NEXT: move $16, $9 +; SOFT-FLOAT-64R2-NEXT: move $17, $8 +; SOFT-FLOAT-64R2-NEXT: move $18, $7 +; SOFT-FLOAT-64R2-NEXT: move $19, $6 +; SOFT-FLOAT-64R2-NEXT: move $20, $5 +; SOFT-FLOAT-64R2-NEXT: move $21, $4 +; SOFT-FLOAT-64R2-NEXT: dsra $4, $4, 32 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: dsra $5, $6, 32 +; SOFT-FLOAT-64R2-NEXT: move $22, $2 +; SOFT-FLOAT-64R2-NEXT: sll $4, $21, 0 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $19, 0 +; SOFT-FLOAT-64R2-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64R2-NEXT: jal __addsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $17, 0 +; SOFT-FLOAT-64R2-NEXT: sll $4, $22, 0 +; SOFT-FLOAT-64R2-NEXT: dsra $5, $17, 32 +; SOFT-FLOAT-64R2-NEXT: jal __addsf3 +; SOFT-FLOAT-64R2-NEXT: dext $17, $2, 0, 32 +; SOFT-FLOAT-64R2-NEXT: # kill: def $v0 killed $v0 def $v0_64 +; SOFT-FLOAT-64R2-NEXT: dsll $1, $2, 32 +; SOFT-FLOAT-64R2-NEXT: dsra $4, $20, 32 +; SOFT-FLOAT-64R2-NEXT: dsra $5, $18, 32 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: or $17, $17, $1 +; SOFT-FLOAT-64R2-NEXT: move $19, $2 +; SOFT-FLOAT-64R2-NEXT: sll $4, $20, 0 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $18, 0 +; SOFT-FLOAT-64R2-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64R2-NEXT: jal __addsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64R2-NEXT: dext $18, $2, 0, 32 +; SOFT-FLOAT-64R2-NEXT: sll $4, $19, 0 +; SOFT-FLOAT-64R2-NEXT: jal __addsf3 +; SOFT-FLOAT-64R2-NEXT: dsra $5, $16, 32 +; SOFT-FLOAT-64R2-NEXT: # kill: def $v0 killed $v0 def $v0_64 +; SOFT-FLOAT-64R2-NEXT: dsll $1, $2, 32 +; SOFT-FLOAT-64R2-NEXT: or $3, $18, $1 +; SOFT-FLOAT-64R2-NEXT: move $2, $17 +; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $17, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $18, 16($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $19, 24($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $20, 32($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $21, 40($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $22, 48($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: jr $ra +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 64 + %product = fmul contract <4 x float> %a, %b + %result = fadd contract <4 x float> %product, %c + ret <4 x float> %result +} + +define <4 x double> @fmuladd_contract_v4f64(<4 x double> 
%a, <4 x double> %b, <4 x double> %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -64 +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-32-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $23, 52($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $22, 48($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $21, 44($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $20, 40($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $19, 36($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $18, 32($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $17, 28($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $16, 24($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32-NEXT: .cfi_offset 30, -8 +; SOFT-FLOAT-32-NEXT: .cfi_offset 23, -12 +; SOFT-FLOAT-32-NEXT: .cfi_offset 22, -16 +; SOFT-FLOAT-32-NEXT: .cfi_offset 21, -20 +; SOFT-FLOAT-32-NEXT: .cfi_offset 20, -24 +; SOFT-FLOAT-32-NEXT: .cfi_offset 19, -28 +; SOFT-FLOAT-32-NEXT: .cfi_offset 18, -32 +; SOFT-FLOAT-32-NEXT: .cfi_offset 17, -36 +; SOFT-FLOAT-32-NEXT: .cfi_offset 16, -40 +; SOFT-FLOAT-32-NEXT: sw $7, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: sw $6, 16($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: move $16, $4 +; SOFT-FLOAT-32-NEXT: lw $4, 88($sp) +; SOFT-FLOAT-32-NEXT: lw $5, 92($sp) +; SOFT-FLOAT-32-NEXT: lw $6, 120($sp) +; SOFT-FLOAT-32-NEXT: lw $7, 124($sp) +; SOFT-FLOAT-32-NEXT: jal __muldf3 +; SOFT-FLOAT-32-NEXT: nop +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: lw $6, 152($sp) +; SOFT-FLOAT-32-NEXT: lw $7, 156($sp) +; SOFT-FLOAT-32-NEXT: jal __adddf3 +; SOFT-FLOAT-32-NEXT: move $5, $3 +; SOFT-FLOAT-32-NEXT: move $19, $2 +; SOFT-FLOAT-32-NEXT: lw $4, 96($sp) +; SOFT-FLOAT-32-NEXT: lw $5, 100($sp) +; SOFT-FLOAT-32-NEXT: lw $6, 128($sp) +; SOFT-FLOAT-32-NEXT: lw $7, 132($sp) +; SOFT-FLOAT-32-NEXT: jal __muldf3 +; SOFT-FLOAT-32-NEXT: move $20, $3 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: lw $6, 160($sp) +; SOFT-FLOAT-32-NEXT: lw $7, 164($sp) +; SOFT-FLOAT-32-NEXT: jal __adddf3 +; SOFT-FLOAT-32-NEXT: move $5, $3 +; SOFT-FLOAT-32-NEXT: move $21, $2 +; SOFT-FLOAT-32-NEXT: lw $4, 80($sp) +; SOFT-FLOAT-32-NEXT: lw $5, 84($sp) +; SOFT-FLOAT-32-NEXT: lw $6, 112($sp) +; SOFT-FLOAT-32-NEXT: lw $7, 116($sp) +; SOFT-FLOAT-32-NEXT: jal __muldf3 +; SOFT-FLOAT-32-NEXT: move $22, $3 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: move $5, $3 +; SOFT-FLOAT-32-NEXT: lw $23, 140($sp) +; SOFT-FLOAT-32-NEXT: lw $fp, 136($sp) +; SOFT-FLOAT-32-NEXT: lw $17, 108($sp) +; SOFT-FLOAT-32-NEXT: lw $18, 104($sp) +; SOFT-FLOAT-32-NEXT: lw $7, 148($sp) +; SOFT-FLOAT-32-NEXT: lw $6, 144($sp) +; SOFT-FLOAT-32-NEXT: sw $22, 28($16) +; SOFT-FLOAT-32-NEXT: sw $21, 24($16) +; SOFT-FLOAT-32-NEXT: sw $20, 20($16) +; SOFT-FLOAT-32-NEXT: jal __adddf3 +; SOFT-FLOAT-32-NEXT: sw $19, 16($16) +; SOFT-FLOAT-32-NEXT: sw $3, 12($16) +; SOFT-FLOAT-32-NEXT: sw $2, 8($16) +; SOFT-FLOAT-32-NEXT: lw $4, 16($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $5, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: move $6, $18 +; SOFT-FLOAT-32-NEXT: jal __muldf3 +; SOFT-FLOAT-32-NEXT: move $7, $17 +; SOFT-FLOAT-32-NEXT: move $4, $2 +; SOFT-FLOAT-32-NEXT: move $5, $3 +; SOFT-FLOAT-32-NEXT: move $6, $fp +; SOFT-FLOAT-32-NEXT: jal __adddf3 +; SOFT-FLOAT-32-NEXT: move $7, $23 +; SOFT-FLOAT-32-NEXT: sw $3, 4($16) +; SOFT-FLOAT-32-NEXT: 
sw $2, 0($16) +; SOFT-FLOAT-32-NEXT: lw $16, 24($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $17, 28($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $18, 32($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $19, 36($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $20, 40($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $21, 44($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $22, 48($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $23, 52($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: jr $ra +; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 64 +; +; SOFT-FLOAT-32R2-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-32R2: # %bb.0: +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, -64 +; SOFT-FLOAT-32R2-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-32R2-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $23, 52($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $22, 48($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $21, 44($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $20, 40($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $19, 36($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $18, 32($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $17, 28($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $16, 24($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 31, -4 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 30, -8 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 23, -12 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 22, -16 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 21, -20 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 20, -24 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 19, -28 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 18, -32 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 17, -36 +; SOFT-FLOAT-32R2-NEXT: .cfi_offset 16, -40 +; SOFT-FLOAT-32R2-NEXT: sw $7, 20($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: sw $6, 16($sp) # 4-byte Folded Spill +; SOFT-FLOAT-32R2-NEXT: move $16, $4 +; SOFT-FLOAT-32R2-NEXT: lw $4, 88($sp) +; SOFT-FLOAT-32R2-NEXT: lw $5, 92($sp) +; SOFT-FLOAT-32R2-NEXT: lw $6, 120($sp) +; SOFT-FLOAT-32R2-NEXT: lw $7, 124($sp) +; SOFT-FLOAT-32R2-NEXT: jal __muldf3 +; SOFT-FLOAT-32R2-NEXT: nop +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: lw $6, 152($sp) +; SOFT-FLOAT-32R2-NEXT: lw $7, 156($sp) +; SOFT-FLOAT-32R2-NEXT: jal __adddf3 +; SOFT-FLOAT-32R2-NEXT: move $5, $3 +; SOFT-FLOAT-32R2-NEXT: move $19, $2 +; SOFT-FLOAT-32R2-NEXT: lw $4, 96($sp) +; SOFT-FLOAT-32R2-NEXT: lw $5, 100($sp) +; SOFT-FLOAT-32R2-NEXT: lw $6, 128($sp) +; SOFT-FLOAT-32R2-NEXT: lw $7, 132($sp) +; SOFT-FLOAT-32R2-NEXT: jal __muldf3 +; SOFT-FLOAT-32R2-NEXT: move $20, $3 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: lw $6, 160($sp) +; SOFT-FLOAT-32R2-NEXT: lw $7, 164($sp) +; SOFT-FLOAT-32R2-NEXT: jal __adddf3 +; SOFT-FLOAT-32R2-NEXT: move $5, $3 +; SOFT-FLOAT-32R2-NEXT: move $21, $2 +; SOFT-FLOAT-32R2-NEXT: lw $4, 80($sp) +; SOFT-FLOAT-32R2-NEXT: lw $5, 84($sp) +; SOFT-FLOAT-32R2-NEXT: lw $6, 112($sp) +; SOFT-FLOAT-32R2-NEXT: lw $7, 116($sp) +; SOFT-FLOAT-32R2-NEXT: jal __muldf3 +; SOFT-FLOAT-32R2-NEXT: move $22, $3 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: move $5, $3 +; SOFT-FLOAT-32R2-NEXT: lw $23, 140($sp) +; SOFT-FLOAT-32R2-NEXT: lw $fp, 136($sp) +; SOFT-FLOAT-32R2-NEXT: lw $17, 108($sp) +; SOFT-FLOAT-32R2-NEXT: lw $18, 104($sp) +; 
SOFT-FLOAT-32R2-NEXT: lw $7, 148($sp) +; SOFT-FLOAT-32R2-NEXT: lw $6, 144($sp) +; SOFT-FLOAT-32R2-NEXT: sw $22, 28($16) +; SOFT-FLOAT-32R2-NEXT: sw $21, 24($16) +; SOFT-FLOAT-32R2-NEXT: sw $20, 20($16) +; SOFT-FLOAT-32R2-NEXT: jal __adddf3 +; SOFT-FLOAT-32R2-NEXT: sw $19, 16($16) +; SOFT-FLOAT-32R2-NEXT: sw $3, 12($16) +; SOFT-FLOAT-32R2-NEXT: sw $2, 8($16) +; SOFT-FLOAT-32R2-NEXT: lw $4, 16($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $5, 20($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: move $6, $18 +; SOFT-FLOAT-32R2-NEXT: jal __muldf3 +; SOFT-FLOAT-32R2-NEXT: move $7, $17 +; SOFT-FLOAT-32R2-NEXT: move $4, $2 +; SOFT-FLOAT-32R2-NEXT: move $5, $3 +; SOFT-FLOAT-32R2-NEXT: move $6, $fp +; SOFT-FLOAT-32R2-NEXT: jal __adddf3 +; SOFT-FLOAT-32R2-NEXT: move $7, $23 +; SOFT-FLOAT-32R2-NEXT: sw $3, 4($16) +; SOFT-FLOAT-32R2-NEXT: sw $2, 0($16) +; SOFT-FLOAT-32R2-NEXT: lw $16, 24($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $17, 28($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $18, 32($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $19, 36($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $20, 40($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $21, 44($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $22, 48($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $23, 52($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload +; SOFT-FLOAT-32R2-NEXT: jr $ra +; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, 64 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -64 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64-NEXT: sd $ra, 56($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $22, 48($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $21, 40($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $20, 32($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $19, 24($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $18, 16($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $17, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64-NEXT: .cfi_offset 22, -16 +; SOFT-FLOAT-64-NEXT: .cfi_offset 21, -24 +; SOFT-FLOAT-64-NEXT: .cfi_offset 20, -32 +; SOFT-FLOAT-64-NEXT: .cfi_offset 19, -40 +; SOFT-FLOAT-64-NEXT: .cfi_offset 18, -48 +; SOFT-FLOAT-64-NEXT: .cfi_offset 17, -56 +; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -64 +; SOFT-FLOAT-64-NEXT: move $17, $10 +; SOFT-FLOAT-64-NEXT: move $18, $9 +; SOFT-FLOAT-64-NEXT: move $19, $8 +; SOFT-FLOAT-64-NEXT: move $20, $6 +; SOFT-FLOAT-64-NEXT: move $21, $5 +; SOFT-FLOAT-64-NEXT: move $16, $4 +; SOFT-FLOAT-64-NEXT: move $4, $7 +; SOFT-FLOAT-64-NEXT: jal __muldf3 +; SOFT-FLOAT-64-NEXT: move $5, $11 +; SOFT-FLOAT-64-NEXT: ld $5, 88($sp) +; SOFT-FLOAT-64-NEXT: jal __adddf3 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: move $22, $2 +; SOFT-FLOAT-64-NEXT: ld $5, 64($sp) +; SOFT-FLOAT-64-NEXT: jal __muldf3 +; SOFT-FLOAT-64-NEXT: move $4, $19 +; SOFT-FLOAT-64-NEXT: ld $5, 96($sp) +; SOFT-FLOAT-64-NEXT: jal __adddf3 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: move $19, $2 +; SOFT-FLOAT-64-NEXT: move $4, $20 +; SOFT-FLOAT-64-NEXT: jal __muldf3 +; SOFT-FLOAT-64-NEXT: move $5, $17 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: ld $17, 72($sp) +; SOFT-FLOAT-64-NEXT: ld $5, 80($sp) +; SOFT-FLOAT-64-NEXT: sd $19, 
24($16) +; SOFT-FLOAT-64-NEXT: jal __adddf3 +; SOFT-FLOAT-64-NEXT: sd $22, 16($16) +; SOFT-FLOAT-64-NEXT: sd $2, 8($16) +; SOFT-FLOAT-64-NEXT: move $4, $21 +; SOFT-FLOAT-64-NEXT: jal __muldf3 +; SOFT-FLOAT-64-NEXT: move $5, $18 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: jal __adddf3 +; SOFT-FLOAT-64-NEXT: move $5, $17 +; SOFT-FLOAT-64-NEXT: sd $2, 0($16) +; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $17, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $18, 16($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $19, 24($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $20, 32($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $21, 40($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $22, 48($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: jr $ra +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 64 +; +; SOFT-FLOAT-64R2-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-64R2: # %bb.0: +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -64 +; SOFT-FLOAT-64R2-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64R2-NEXT: sd $ra, 56($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $22, 48($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $21, 40($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $20, 32($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $19, 24($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $18, 16($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $17, 8($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 22, -16 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 21, -24 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 20, -32 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 19, -40 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 18, -48 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 17, -56 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -64 +; SOFT-FLOAT-64R2-NEXT: move $17, $10 +; SOFT-FLOAT-64R2-NEXT: move $18, $9 +; SOFT-FLOAT-64R2-NEXT: move $19, $8 +; SOFT-FLOAT-64R2-NEXT: move $20, $6 +; SOFT-FLOAT-64R2-NEXT: move $21, $5 +; SOFT-FLOAT-64R2-NEXT: move $16, $4 +; SOFT-FLOAT-64R2-NEXT: move $4, $7 +; SOFT-FLOAT-64R2-NEXT: jal __muldf3 +; SOFT-FLOAT-64R2-NEXT: move $5, $11 +; SOFT-FLOAT-64R2-NEXT: ld $5, 88($sp) +; SOFT-FLOAT-64R2-NEXT: jal __adddf3 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: move $22, $2 +; SOFT-FLOAT-64R2-NEXT: ld $5, 64($sp) +; SOFT-FLOAT-64R2-NEXT: jal __muldf3 +; SOFT-FLOAT-64R2-NEXT: move $4, $19 +; SOFT-FLOAT-64R2-NEXT: ld $5, 96($sp) +; SOFT-FLOAT-64R2-NEXT: jal __adddf3 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: move $19, $2 +; SOFT-FLOAT-64R2-NEXT: move $4, $20 +; SOFT-FLOAT-64R2-NEXT: jal __muldf3 +; SOFT-FLOAT-64R2-NEXT: move $5, $17 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: ld $17, 72($sp) +; SOFT-FLOAT-64R2-NEXT: ld $5, 80($sp) +; SOFT-FLOAT-64R2-NEXT: sd $19, 24($16) +; SOFT-FLOAT-64R2-NEXT: jal __adddf3 +; SOFT-FLOAT-64R2-NEXT: sd $22, 16($16) +; SOFT-FLOAT-64R2-NEXT: sd $2, 8($16) +; SOFT-FLOAT-64R2-NEXT: move $4, $21 +; SOFT-FLOAT-64R2-NEXT: jal __muldf3 +; SOFT-FLOAT-64R2-NEXT: move $5, $18 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: jal __adddf3 +; SOFT-FLOAT-64R2-NEXT: move $5, $17 +; SOFT-FLOAT-64R2-NEXT: sd $2, 0($16) +; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $17, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld 
$18, 16($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $19, 24($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $20, 32($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $21, 40($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $22, 48($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: jr $ra +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 64 + %product = fmul contract <4 x double> %a, %b + %result = fadd contract <4 x double> %product, %c + ret <4 x double> %result +} + +attributes #0 = { "use-soft-float"="true" } + +declare float @llvm.fmuladd.f32(float %a, float %b, float %c) +declare double @llvm.fmuladd.f64(double %a, double %b, double %c) diff --git a/llvm/test/CodeGen/SPARC/fmuladd-soft-float.ll b/llvm/test/CodeGen/SPARC/fmuladd-soft-float.ll new file mode 100644 index 000000000000..a9e666e3c9b4 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/fmuladd-soft-float.ll @@ -0,0 +1,385 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=sparc < %s | FileCheck %s -check-prefix=SOFT-FLOAT-32 +; RUN: llc -mtriple=sparc64 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-64 + +define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-32: .cfi_startproc +; SOFT-FLOAT-32-NEXT: ! %bb.0: +; SOFT-FLOAT-32-NEXT: save %sp, -96, %sp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-32-NEXT: .cfi_window_save +; SOFT-FLOAT-32-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-32-NEXT: mov %i0, %o0 +; SOFT-FLOAT-32-NEXT: call __mulsf3 +; SOFT-FLOAT-32-NEXT: mov %i1, %o1 +; SOFT-FLOAT-32-NEXT: call __addsf3 +; SOFT-FLOAT-32-NEXT: mov %i2, %o1 +; SOFT-FLOAT-32-NEXT: ret +; SOFT-FLOAT-32-NEXT: restore %g0, %o0, %o0 +; +; SOFT-FLOAT-64-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-64: .cfi_startproc +; SOFT-FLOAT-64-NEXT: ! %bb.0: +; SOFT-FLOAT-64-NEXT: save %sp, -176, %sp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-64-NEXT: .cfi_window_save +; SOFT-FLOAT-64-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-64-NEXT: srl %i0, 0, %o0 +; SOFT-FLOAT-64-NEXT: call __mulsf3 +; SOFT-FLOAT-64-NEXT: srl %i1, 0, %o1 +; SOFT-FLOAT-64-NEXT: call __addsf3 +; SOFT-FLOAT-64-NEXT: srl %i2, 0, %o1 +; SOFT-FLOAT-64-NEXT: ret +; SOFT-FLOAT-64-NEXT: restore %g0, %o0, %o0 + %result = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + ret float %result +} + +define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-32: .cfi_startproc +; SOFT-FLOAT-32-NEXT: ! %bb.0: +; SOFT-FLOAT-32-NEXT: save %sp, -96, %sp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-32-NEXT: .cfi_window_save +; SOFT-FLOAT-32-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-32-NEXT: mov %i0, %o0 +; SOFT-FLOAT-32-NEXT: mov %i1, %o1 +; SOFT-FLOAT-32-NEXT: mov %i2, %o2 +; SOFT-FLOAT-32-NEXT: call __muldf3 +; SOFT-FLOAT-32-NEXT: mov %i3, %o3 +; SOFT-FLOAT-32-NEXT: mov %i4, %o2 +; SOFT-FLOAT-32-NEXT: call __adddf3 +; SOFT-FLOAT-32-NEXT: mov %i5, %o3 +; SOFT-FLOAT-32-NEXT: mov %o0, %i0 +; SOFT-FLOAT-32-NEXT: ret +; SOFT-FLOAT-32-NEXT: restore %g0, %o1, %o1 +; +; SOFT-FLOAT-64-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-64: .cfi_startproc +; SOFT-FLOAT-64-NEXT: ! 
%bb.0: +; SOFT-FLOAT-64-NEXT: save %sp, -176, %sp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-64-NEXT: .cfi_window_save +; SOFT-FLOAT-64-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-64-NEXT: mov %i0, %o0 +; SOFT-FLOAT-64-NEXT: call __muldf3 +; SOFT-FLOAT-64-NEXT: mov %i1, %o1 +; SOFT-FLOAT-64-NEXT: call __adddf3 +; SOFT-FLOAT-64-NEXT: mov %i2, %o1 +; SOFT-FLOAT-64-NEXT: ret +; SOFT-FLOAT-64-NEXT: restore %g0, %o0, %o0 + %result = call double @llvm.fmuladd.f64(double %a, double %b, double %c) + ret double %result +} + +define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-32: .cfi_startproc +; SOFT-FLOAT-32-NEXT: ! %bb.0: +; SOFT-FLOAT-32-NEXT: save %sp, -96, %sp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-32-NEXT: .cfi_window_save +; SOFT-FLOAT-32-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-32-NEXT: mov %i0, %o0 +; SOFT-FLOAT-32-NEXT: call __mulsf3 +; SOFT-FLOAT-32-NEXT: mov %i1, %o1 +; SOFT-FLOAT-32-NEXT: call __addsf3 +; SOFT-FLOAT-32-NEXT: mov %i2, %o1 +; SOFT-FLOAT-32-NEXT: ret +; SOFT-FLOAT-32-NEXT: restore %g0, %o0, %o0 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-64: .cfi_startproc +; SOFT-FLOAT-64-NEXT: ! %bb.0: +; SOFT-FLOAT-64-NEXT: save %sp, -176, %sp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-64-NEXT: .cfi_window_save +; SOFT-FLOAT-64-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-64-NEXT: srl %i0, 0, %o0 +; SOFT-FLOAT-64-NEXT: call __mulsf3 +; SOFT-FLOAT-64-NEXT: srl %i1, 0, %o1 +; SOFT-FLOAT-64-NEXT: call __addsf3 +; SOFT-FLOAT-64-NEXT: srl %i2, 0, %o1 +; SOFT-FLOAT-64-NEXT: ret +; SOFT-FLOAT-64-NEXT: restore %g0, %o0, %o0 + %product = fmul contract float %a, %b + %result = fadd contract float %product, %c + ret float %result +} + +define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-32: .cfi_startproc +; SOFT-FLOAT-32-NEXT: ! %bb.0: +; SOFT-FLOAT-32-NEXT: save %sp, -96, %sp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-32-NEXT: .cfi_window_save +; SOFT-FLOAT-32-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-32-NEXT: mov %i0, %o0 +; SOFT-FLOAT-32-NEXT: mov %i1, %o1 +; SOFT-FLOAT-32-NEXT: mov %i2, %o2 +; SOFT-FLOAT-32-NEXT: call __muldf3 +; SOFT-FLOAT-32-NEXT: mov %i3, %o3 +; SOFT-FLOAT-32-NEXT: mov %i4, %o2 +; SOFT-FLOAT-32-NEXT: call __adddf3 +; SOFT-FLOAT-32-NEXT: mov %i5, %o3 +; SOFT-FLOAT-32-NEXT: mov %o0, %i0 +; SOFT-FLOAT-32-NEXT: ret +; SOFT-FLOAT-32-NEXT: restore %g0, %o1, %o1 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-64: .cfi_startproc +; SOFT-FLOAT-64-NEXT: ! %bb.0: +; SOFT-FLOAT-64-NEXT: save %sp, -176, %sp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-64-NEXT: .cfi_window_save +; SOFT-FLOAT-64-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-64-NEXT: mov %i0, %o0 +; SOFT-FLOAT-64-NEXT: call __muldf3 +; SOFT-FLOAT-64-NEXT: mov %i1, %o1 +; SOFT-FLOAT-64-NEXT: call __adddf3 +; SOFT-FLOAT-64-NEXT: mov %i2, %o1 +; SOFT-FLOAT-64-NEXT: ret +; SOFT-FLOAT-64-NEXT: restore %g0, %o0, %o0 + %product = fmul contract double %a, %b + %result = fadd contract double %product, %c + ret double %result +} + +define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-32: .cfi_startproc +; SOFT-FLOAT-32-NEXT: ! 
%bb.0: +; SOFT-FLOAT-32-NEXT: save %sp, -96, %sp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-32-NEXT: .cfi_window_save +; SOFT-FLOAT-32-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-32-NEXT: ld [%fp+100], %l0 +; SOFT-FLOAT-32-NEXT: ld [%fp+104], %l1 +; SOFT-FLOAT-32-NEXT: ld [%fp+108], %l2 +; SOFT-FLOAT-32-NEXT: ld [%fp+112], %l3 +; SOFT-FLOAT-32-NEXT: ld [%fp+96], %l4 +; SOFT-FLOAT-32-NEXT: ld [%fp+92], %l5 +; SOFT-FLOAT-32-NEXT: mov %i0, %o0 +; SOFT-FLOAT-32-NEXT: call __mulsf3 +; SOFT-FLOAT-32-NEXT: mov %i4, %o1 +; SOFT-FLOAT-32-NEXT: mov %o0, %l6 +; SOFT-FLOAT-32-NEXT: mov %i1, %o0 +; SOFT-FLOAT-32-NEXT: call __mulsf3 +; SOFT-FLOAT-32-NEXT: mov %i5, %o1 +; SOFT-FLOAT-32-NEXT: mov %o0, %i1 +; SOFT-FLOAT-32-NEXT: mov %i2, %o0 +; SOFT-FLOAT-32-NEXT: call __mulsf3 +; SOFT-FLOAT-32-NEXT: mov %l5, %o1 +; SOFT-FLOAT-32-NEXT: mov %o0, %i4 +; SOFT-FLOAT-32-NEXT: mov %i3, %o0 +; SOFT-FLOAT-32-NEXT: call __mulsf3 +; SOFT-FLOAT-32-NEXT: mov %l4, %o1 +; SOFT-FLOAT-32-NEXT: call __addsf3 +; SOFT-FLOAT-32-NEXT: mov %l3, %o1 +; SOFT-FLOAT-32-NEXT: mov %o0, %i3 +; SOFT-FLOAT-32-NEXT: mov %i4, %o0 +; SOFT-FLOAT-32-NEXT: call __addsf3 +; SOFT-FLOAT-32-NEXT: mov %l2, %o1 +; SOFT-FLOAT-32-NEXT: mov %o0, %i2 +; SOFT-FLOAT-32-NEXT: mov %i1, %o0 +; SOFT-FLOAT-32-NEXT: call __addsf3 +; SOFT-FLOAT-32-NEXT: mov %l1, %o1 +; SOFT-FLOAT-32-NEXT: mov %o0, %i1 +; SOFT-FLOAT-32-NEXT: mov %l6, %o0 +; SOFT-FLOAT-32-NEXT: call __addsf3 +; SOFT-FLOAT-32-NEXT: mov %l0, %o1 +; SOFT-FLOAT-32-NEXT: ret +; SOFT-FLOAT-32-NEXT: restore %g0, %o0, %o0 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-64: .cfi_startproc +; SOFT-FLOAT-64-NEXT: ! %bb.0: +; SOFT-FLOAT-64-NEXT: save %sp, -176, %sp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-64-NEXT: .cfi_window_save +; SOFT-FLOAT-64-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-64-NEXT: ld [%fp+2267], %l0 +; SOFT-FLOAT-64-NEXT: ld [%fp+2259], %l1 +; SOFT-FLOAT-64-NEXT: ld [%fp+2251], %l2 +; SOFT-FLOAT-64-NEXT: ld [%fp+2243], %l3 +; SOFT-FLOAT-64-NEXT: ld [%fp+2227], %l4 +; SOFT-FLOAT-64-NEXT: ld [%fp+2235], %o1 +; SOFT-FLOAT-64-NEXT: call __mulsf3 +; SOFT-FLOAT-64-NEXT: srl %i3, 0, %o0 +; SOFT-FLOAT-64-NEXT: mov %o0, %i3 +; SOFT-FLOAT-64-NEXT: srl %i2, 0, %o0 +; SOFT-FLOAT-64-NEXT: call __mulsf3 +; SOFT-FLOAT-64-NEXT: mov %l4, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i2 +; SOFT-FLOAT-64-NEXT: srl %i1, 0, %o0 +; SOFT-FLOAT-64-NEXT: call __mulsf3 +; SOFT-FLOAT-64-NEXT: srl %i5, 0, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i1 +; SOFT-FLOAT-64-NEXT: srl %i0, 0, %o0 +; SOFT-FLOAT-64-NEXT: call __mulsf3 +; SOFT-FLOAT-64-NEXT: srl %i4, 0, %o1 +; SOFT-FLOAT-64-NEXT: call __addsf3 +; SOFT-FLOAT-64-NEXT: mov %l3, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i0 +; SOFT-FLOAT-64-NEXT: mov %i1, %o0 +; SOFT-FLOAT-64-NEXT: call __addsf3 +; SOFT-FLOAT-64-NEXT: mov %l2, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i1 +; SOFT-FLOAT-64-NEXT: mov %i2, %o0 +; SOFT-FLOAT-64-NEXT: call __addsf3 +; SOFT-FLOAT-64-NEXT: mov %l1, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i2 +; SOFT-FLOAT-64-NEXT: mov %i3, %o0 +; SOFT-FLOAT-64-NEXT: call __addsf3 +; SOFT-FLOAT-64-NEXT: mov %l0, %o1 +; SOFT-FLOAT-64-NEXT: ret +; SOFT-FLOAT-64-NEXT: restore %g0, %o0, %o3 + %product = fmul contract <4 x float> %a, %b + %result = fadd contract <4 x float> %product, %c + ret <4 x float> %result +} + +define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-32: .cfi_startproc +; SOFT-FLOAT-32-NEXT: ! 
%bb.0: +; SOFT-FLOAT-32-NEXT: save %sp, -128, %sp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-32-NEXT: .cfi_window_save +; SOFT-FLOAT-32-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-32-NEXT: ld [%fp+64], %l6 +; SOFT-FLOAT-32-NEXT: ld [%fp+156], %g2 +; SOFT-FLOAT-32-NEXT: st %g2, [%fp+-4] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: ld [%fp+160], %g2 +; SOFT-FLOAT-32-NEXT: st %g2, [%fp+-8] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: ld [%fp+148], %g2 +; SOFT-FLOAT-32-NEXT: st %g2, [%fp+-12] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: ld [%fp+152], %g2 +; SOFT-FLOAT-32-NEXT: st %g2, [%fp+-16] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: ld [%fp+140], %g2 +; SOFT-FLOAT-32-NEXT: st %g2, [%fp+-20] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: ld [%fp+144], %g2 +; SOFT-FLOAT-32-NEXT: st %g2, [%fp+-24] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: ld [%fp+132], %g2 +; SOFT-FLOAT-32-NEXT: st %g2, [%fp+-28] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: ld [%fp+136], %l7 +; SOFT-FLOAT-32-NEXT: ld [%fp+100], %l0 +; SOFT-FLOAT-32-NEXT: ld [%fp+104], %l1 +; SOFT-FLOAT-32-NEXT: ld [%fp+108], %l2 +; SOFT-FLOAT-32-NEXT: ld [%fp+112], %l3 +; SOFT-FLOAT-32-NEXT: ld [%fp+116], %l4 +; SOFT-FLOAT-32-NEXT: ld [%fp+120], %l5 +; SOFT-FLOAT-32-NEXT: ld [%fp+92], %o0 +; SOFT-FLOAT-32-NEXT: ld [%fp+96], %o1 +; SOFT-FLOAT-32-NEXT: ld [%fp+124], %o2 +; SOFT-FLOAT-32-NEXT: call __muldf3 +; SOFT-FLOAT-32-NEXT: ld [%fp+128], %o3 +; SOFT-FLOAT-32-NEXT: st %o0, [%fp+-32] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: st %o1, [%fp+-36] ! 4-byte Folded Spill +; SOFT-FLOAT-32-NEXT: mov %i4, %o0 +; SOFT-FLOAT-32-NEXT: mov %i5, %o1 +; SOFT-FLOAT-32-NEXT: mov %l4, %o2 +; SOFT-FLOAT-32-NEXT: call __muldf3 +; SOFT-FLOAT-32-NEXT: mov %l5, %o3 +; SOFT-FLOAT-32-NEXT: mov %o0, %l4 +; SOFT-FLOAT-32-NEXT: mov %o1, %l5 +; SOFT-FLOAT-32-NEXT: mov %i2, %o0 +; SOFT-FLOAT-32-NEXT: mov %i3, %o1 +; SOFT-FLOAT-32-NEXT: mov %l2, %o2 +; SOFT-FLOAT-32-NEXT: call __muldf3 +; SOFT-FLOAT-32-NEXT: mov %l3, %o3 +; SOFT-FLOAT-32-NEXT: mov %o0, %i4 +; SOFT-FLOAT-32-NEXT: mov %o1, %i5 +; SOFT-FLOAT-32-NEXT: mov %i0, %o0 +; SOFT-FLOAT-32-NEXT: mov %i1, %o1 +; SOFT-FLOAT-32-NEXT: mov %l0, %o2 +; SOFT-FLOAT-32-NEXT: call __muldf3 +; SOFT-FLOAT-32-NEXT: mov %l1, %o3 +; SOFT-FLOAT-32-NEXT: ld [%fp+-28], %o2 ! 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: call __adddf3 +; SOFT-FLOAT-32-NEXT: mov %l7, %o3 +; SOFT-FLOAT-32-NEXT: mov %o0, %i2 +; SOFT-FLOAT-32-NEXT: mov %o1, %i3 +; SOFT-FLOAT-32-NEXT: mov %i4, %o0 +; SOFT-FLOAT-32-NEXT: mov %i5, %o1 +; SOFT-FLOAT-32-NEXT: ld [%fp+-20], %o2 ! 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: call __adddf3 +; SOFT-FLOAT-32-NEXT: ld [%fp+-24], %o3 +; SOFT-FLOAT-32-NEXT: mov %o0, %i4 +; SOFT-FLOAT-32-NEXT: mov %o1, %i5 +; SOFT-FLOAT-32-NEXT: mov %l4, %o0 +; SOFT-FLOAT-32-NEXT: mov %l5, %o1 +; SOFT-FLOAT-32-NEXT: ld [%fp+-12], %o2 ! 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: call __adddf3 +; SOFT-FLOAT-32-NEXT: ld [%fp+-16], %o3 +; SOFT-FLOAT-32-NEXT: mov %o0, %i0 +; SOFT-FLOAT-32-NEXT: mov %o1, %i1 +; SOFT-FLOAT-32-NEXT: ld [%fp+-32], %o0 ! 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: ld [%fp+-36], %o1 ! 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: ld [%fp+-4], %o2 ! 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: call __adddf3 +; SOFT-FLOAT-32-NEXT: ld [%fp+-8], %o3 +; SOFT-FLOAT-32-NEXT: ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1 +; SOFT-FLOAT-32-NEXT: ! 
kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1 +; SOFT-FLOAT-32-NEXT: std %o0, [%l6+24] +; SOFT-FLOAT-32-NEXT: std %i0, [%l6+16] +; SOFT-FLOAT-32-NEXT: std %i4, [%l6+8] +; SOFT-FLOAT-32-NEXT: std %i2, [%l6] +; SOFT-FLOAT-32-NEXT: ret +; SOFT-FLOAT-32-NEXT: restore +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-64: .cfi_startproc +; SOFT-FLOAT-64-NEXT: ! %bb.0: +; SOFT-FLOAT-64-NEXT: save %sp, -176, %sp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_register %fp +; SOFT-FLOAT-64-NEXT: .cfi_window_save +; SOFT-FLOAT-64-NEXT: .cfi_register %o7, %i7 +; SOFT-FLOAT-64-NEXT: ldx [%fp+2263], %l0 +; SOFT-FLOAT-64-NEXT: ldx [%fp+2255], %l1 +; SOFT-FLOAT-64-NEXT: ldx [%fp+2247], %l2 +; SOFT-FLOAT-64-NEXT: ldx [%fp+2239], %l3 +; SOFT-FLOAT-64-NEXT: ldx [%fp+2223], %l4 +; SOFT-FLOAT-64-NEXT: ldx [%fp+2231], %o1 +; SOFT-FLOAT-64-NEXT: call __muldf3 +; SOFT-FLOAT-64-NEXT: mov %i3, %o0 +; SOFT-FLOAT-64-NEXT: mov %o0, %i3 +; SOFT-FLOAT-64-NEXT: mov %i2, %o0 +; SOFT-FLOAT-64-NEXT: call __muldf3 +; SOFT-FLOAT-64-NEXT: mov %l4, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i2 +; SOFT-FLOAT-64-NEXT: mov %i1, %o0 +; SOFT-FLOAT-64-NEXT: call __muldf3 +; SOFT-FLOAT-64-NEXT: mov %i5, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i1 +; SOFT-FLOAT-64-NEXT: mov %i0, %o0 +; SOFT-FLOAT-64-NEXT: call __muldf3 +; SOFT-FLOAT-64-NEXT: mov %i4, %o1 +; SOFT-FLOAT-64-NEXT: call __adddf3 +; SOFT-FLOAT-64-NEXT: mov %l3, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i0 +; SOFT-FLOAT-64-NEXT: mov %i1, %o0 +; SOFT-FLOAT-64-NEXT: call __adddf3 +; SOFT-FLOAT-64-NEXT: mov %l2, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i1 +; SOFT-FLOAT-64-NEXT: mov %i2, %o0 +; SOFT-FLOAT-64-NEXT: call __adddf3 +; SOFT-FLOAT-64-NEXT: mov %l1, %o1 +; SOFT-FLOAT-64-NEXT: mov %o0, %i2 +; SOFT-FLOAT-64-NEXT: mov %i3, %o0 +; SOFT-FLOAT-64-NEXT: call __adddf3 +; SOFT-FLOAT-64-NEXT: mov %l0, %o1 +; SOFT-FLOAT-64-NEXT: ret +; SOFT-FLOAT-64-NEXT: restore %g0, %o0, %o3 + %product = fmul contract <4 x double> %a, %b + %result = fadd contract <4 x double> %product, %c + ret <4 x double> %result +} + +attributes #0 = { "use-soft-float"="true" } + +declare float @llvm.fmuladd.f32(float %a, float %b, float %c) +declare double @llvm.fmuladd.f64(double %a, double %b, double %c) diff --git a/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll b/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll new file mode 100644 index 000000000000..b01c348b631b --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=s390x < %s | FileCheck %s -check-prefix=SOFT-FLOAT + +define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT: # %bb.0: +; SOFT-FLOAT-NEXT: stmg %r13, %r15, 104(%r15) +; SOFT-FLOAT-NEXT: .cfi_offset %r13, -56 +; SOFT-FLOAT-NEXT: .cfi_offset %r14, -48 +; SOFT-FLOAT-NEXT: .cfi_offset %r15, -40 +; SOFT-FLOAT-NEXT: aghi %r15, -160 +; SOFT-FLOAT-NEXT: .cfi_def_cfa_offset 320 +; SOFT-FLOAT-NEXT: llgfr %r2, %r2 +; SOFT-FLOAT-NEXT: llgfr %r3, %r3 +; SOFT-FLOAT-NEXT: lr %r13, %r4 +; SOFT-FLOAT-NEXT: brasl %r14, __mulsf3@PLT +; SOFT-FLOAT-NEXT: llgfr %r3, %r13 +; SOFT-FLOAT-NEXT: brasl %r14, __addsf3@PLT +; SOFT-FLOAT-NEXT: # kill: def $r2l killed $r2l killed $r2d +; SOFT-FLOAT-NEXT: lmg %r13, %r15, 264(%r15) +; SOFT-FLOAT-NEXT: br %r14 + %result = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + ret float %result +} + +define double @fmuladd_intrinsic_f64(double %a, double %b, 
double %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT: # %bb.0: +; SOFT-FLOAT-NEXT: stmg %r13, %r15, 104(%r15) +; SOFT-FLOAT-NEXT: .cfi_offset %r13, -56 +; SOFT-FLOAT-NEXT: .cfi_offset %r14, -48 +; SOFT-FLOAT-NEXT: .cfi_offset %r15, -40 +; SOFT-FLOAT-NEXT: aghi %r15, -160 +; SOFT-FLOAT-NEXT: .cfi_def_cfa_offset 320 +; SOFT-FLOAT-NEXT: lgr %r13, %r4 +; SOFT-FLOAT-NEXT: brasl %r14, __muldf3@PLT +; SOFT-FLOAT-NEXT: lgr %r3, %r13 +; SOFT-FLOAT-NEXT: brasl %r14, __adddf3@PLT +; SOFT-FLOAT-NEXT: lmg %r13, %r15, 264(%r15) +; SOFT-FLOAT-NEXT: br %r14 + %result = call double @llvm.fmuladd.f64(double %a, double %b, double %c) + ret double %result +} + +define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT: # %bb.0: +; SOFT-FLOAT-NEXT: stmg %r13, %r15, 104(%r15) +; SOFT-FLOAT-NEXT: .cfi_offset %r13, -56 +; SOFT-FLOAT-NEXT: .cfi_offset %r14, -48 +; SOFT-FLOAT-NEXT: .cfi_offset %r15, -40 +; SOFT-FLOAT-NEXT: aghi %r15, -160 +; SOFT-FLOAT-NEXT: .cfi_def_cfa_offset 320 +; SOFT-FLOAT-NEXT: llgfr %r2, %r2 +; SOFT-FLOAT-NEXT: llgfr %r3, %r3 +; SOFT-FLOAT-NEXT: lr %r13, %r4 +; SOFT-FLOAT-NEXT: brasl %r14, __mulsf3@PLT +; SOFT-FLOAT-NEXT: llgfr %r3, %r13 +; SOFT-FLOAT-NEXT: brasl %r14, __addsf3@PLT +; SOFT-FLOAT-NEXT: # kill: def $r2l killed $r2l killed $r2d +; SOFT-FLOAT-NEXT: lmg %r13, %r15, 264(%r15) +; SOFT-FLOAT-NEXT: br %r14 + %product = fmul contract float %a, %b + %result = fadd contract float %product, %c + ret float %result +} + +define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT: # %bb.0: +; SOFT-FLOAT-NEXT: stmg %r13, %r15, 104(%r15) +; SOFT-FLOAT-NEXT: .cfi_offset %r13, -56 +; SOFT-FLOAT-NEXT: .cfi_offset %r14, -48 +; SOFT-FLOAT-NEXT: .cfi_offset %r15, -40 +; SOFT-FLOAT-NEXT: aghi %r15, -160 +; SOFT-FLOAT-NEXT: .cfi_def_cfa_offset 320 +; SOFT-FLOAT-NEXT: lgr %r13, %r4 +; SOFT-FLOAT-NEXT: brasl %r14, __muldf3@PLT +; SOFT-FLOAT-NEXT: lgr %r3, %r13 +; SOFT-FLOAT-NEXT: brasl %r14, __adddf3@PLT +; SOFT-FLOAT-NEXT: lmg %r13, %r15, 264(%r15) +; SOFT-FLOAT-NEXT: br %r14 + %product = fmul contract double %a, %b + %result = fadd contract double %product, %c + ret double %result +} + +define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT: # %bb.0: +; SOFT-FLOAT-NEXT: stmg %r7, %r15, 56(%r15) +; SOFT-FLOAT-NEXT: .cfi_offset %r7, -104 +; SOFT-FLOAT-NEXT: .cfi_offset %r8, -96 +; SOFT-FLOAT-NEXT: .cfi_offset %r9, -88 +; SOFT-FLOAT-NEXT: .cfi_offset %r10, -80 +; SOFT-FLOAT-NEXT: .cfi_offset %r11, -72 +; SOFT-FLOAT-NEXT: .cfi_offset %r12, -64 +; SOFT-FLOAT-NEXT: .cfi_offset %r13, -56 +; SOFT-FLOAT-NEXT: .cfi_offset %r14, -48 +; SOFT-FLOAT-NEXT: .cfi_offset %r15, -40 +; SOFT-FLOAT-NEXT: aghi %r15, -176 +; SOFT-FLOAT-NEXT: .cfi_def_cfa_offset 336 +; SOFT-FLOAT-NEXT: llgf %r0, 388(%r15) +; SOFT-FLOAT-NEXT: stg %r0, 168(%r15) # 8-byte Folded Spill +; SOFT-FLOAT-NEXT: llgf %r0, 380(%r15) +; SOFT-FLOAT-NEXT: stg %r0, 160(%r15) # 8-byte Folded Spill +; SOFT-FLOAT-NEXT: llgf %r11, 372(%r15) +; SOFT-FLOAT-NEXT: llgf %r10, 364(%r15) +; SOFT-FLOAT-NEXT: llgf %r8, 340(%r15) +; SOFT-FLOAT-NEXT: llgf %r0, 356(%r15) +; SOFT-FLOAT-NEXT: llgf %r7, 348(%r15) +; SOFT-FLOAT-NEXT: llgfr %r1, %r5 +; SOFT-FLOAT-NEXT: lr %r9, %r4 +; SOFT-FLOAT-NEXT: lr %r13, %r3 +; SOFT-FLOAT-NEXT: lr %r12, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r1 +; SOFT-FLOAT-NEXT: lgr %r3, %r0 +; 
SOFT-FLOAT-NEXT: brasl %r14, __mulsf3@PLT +; SOFT-FLOAT-NEXT: llgfr %r0, %r9 +; SOFT-FLOAT-NEXT: lgr %r9, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r0 +; SOFT-FLOAT-NEXT: lgr %r3, %r7 +; SOFT-FLOAT-NEXT: brasl %r14, __mulsf3@PLT +; SOFT-FLOAT-NEXT: llgfr %r0, %r13 +; SOFT-FLOAT-NEXT: lgr %r13, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r0 +; SOFT-FLOAT-NEXT: lgr %r3, %r8 +; SOFT-FLOAT-NEXT: brasl %r14, __mulsf3@PLT +; SOFT-FLOAT-NEXT: llgfr %r0, %r12 +; SOFT-FLOAT-NEXT: llgfr %r3, %r6 +; SOFT-FLOAT-NEXT: lgr %r12, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r0 +; SOFT-FLOAT-NEXT: brasl %r14, __mulsf3@PLT +; SOFT-FLOAT-NEXT: lgr %r3, %r10 +; SOFT-FLOAT-NEXT: brasl %r14, __addsf3@PLT +; SOFT-FLOAT-NEXT: lgr %r10, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r12 +; SOFT-FLOAT-NEXT: lgr %r3, %r11 +; SOFT-FLOAT-NEXT: brasl %r14, __addsf3@PLT +; SOFT-FLOAT-NEXT: lgr %r12, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r13 +; SOFT-FLOAT-NEXT: lg %r3, 160(%r15) # 8-byte Folded Reload +; SOFT-FLOAT-NEXT: brasl %r14, __addsf3@PLT +; SOFT-FLOAT-NEXT: lgr %r13, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r9 +; SOFT-FLOAT-NEXT: lg %r3, 168(%r15) # 8-byte Folded Reload +; SOFT-FLOAT-NEXT: brasl %r14, __addsf3@PLT +; SOFT-FLOAT-NEXT: lgr %r5, %r2 +; SOFT-FLOAT-NEXT: lr %r2, %r10 +; SOFT-FLOAT-NEXT: lr %r3, %r12 +; SOFT-FLOAT-NEXT: lr %r4, %r13 +; SOFT-FLOAT-NEXT: # kill: def $r5l killed $r5l killed $r5d +; SOFT-FLOAT-NEXT: lmg %r7, %r15, 232(%r15) +; SOFT-FLOAT-NEXT: br %r14 + %product = fmul contract <4 x float> %a, %b + %result = fadd contract <4 x float> %product, %c + ret <4 x float> %result +} + +define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 { +; SOFT-FLOAT-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT: # %bb.0: +; SOFT-FLOAT-NEXT: stmg %r6, %r15, 48(%r15) +; SOFT-FLOAT-NEXT: .cfi_offset %r6, -112 +; SOFT-FLOAT-NEXT: .cfi_offset %r7, -104 +; SOFT-FLOAT-NEXT: .cfi_offset %r8, -96 +; SOFT-FLOAT-NEXT: .cfi_offset %r9, -88 +; SOFT-FLOAT-NEXT: .cfi_offset %r10, -80 +; SOFT-FLOAT-NEXT: .cfi_offset %r11, -72 +; SOFT-FLOAT-NEXT: .cfi_offset %r12, -64 +; SOFT-FLOAT-NEXT: .cfi_offset %r13, -56 +; SOFT-FLOAT-NEXT: .cfi_offset %r14, -48 +; SOFT-FLOAT-NEXT: .cfi_offset %r15, -40 +; SOFT-FLOAT-NEXT: aghi %r15, -184 +; SOFT-FLOAT-NEXT: .cfi_def_cfa_offset 344 +; SOFT-FLOAT-NEXT: mvc 176(8,%r15), 24(%r4) # 8-byte Folded Spill +; SOFT-FLOAT-NEXT: mvc 168(8,%r15), 16(%r4) # 8-byte Folded Spill +; SOFT-FLOAT-NEXT: mvc 160(8,%r15), 8(%r4) # 8-byte Folded Spill +; SOFT-FLOAT-NEXT: lg %r10, 0(%r4) +; SOFT-FLOAT-NEXT: lg %r9, 0(%r2) +; SOFT-FLOAT-NEXT: lg %r8, 0(%r3) +; SOFT-FLOAT-NEXT: lg %r7, 8(%r2) +; SOFT-FLOAT-NEXT: lg %r6, 8(%r3) +; SOFT-FLOAT-NEXT: lg %r13, 16(%r2) +; SOFT-FLOAT-NEXT: lg %r2, 24(%r2) +; SOFT-FLOAT-NEXT: lg %r0, 24(%r3) +; SOFT-FLOAT-NEXT: lg %r12, 16(%r3) +; SOFT-FLOAT-NEXT: lgr %r3, %r0 +; SOFT-FLOAT-NEXT: brasl %r14, __muldf3@PLT +; SOFT-FLOAT-NEXT: lgr %r11, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r13 +; SOFT-FLOAT-NEXT: lgr %r3, %r12 +; SOFT-FLOAT-NEXT: brasl %r14, __muldf3@PLT +; SOFT-FLOAT-NEXT: lgr %r13, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r7 +; SOFT-FLOAT-NEXT: lgr %r3, %r6 +; SOFT-FLOAT-NEXT: brasl %r14, __muldf3@PLT +; SOFT-FLOAT-NEXT: lgr %r12, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r9 +; SOFT-FLOAT-NEXT: lgr %r3, %r8 +; SOFT-FLOAT-NEXT: brasl %r14, __muldf3@PLT +; SOFT-FLOAT-NEXT: lgr %r3, %r10 +; SOFT-FLOAT-NEXT: brasl %r14, __adddf3@PLT +; SOFT-FLOAT-NEXT: lgr %r10, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r12 +; SOFT-FLOAT-NEXT: lg %r3, 160(%r15) # 8-byte Folded Reload +; SOFT-FLOAT-NEXT: brasl %r14, __adddf3@PLT 
+; SOFT-FLOAT-NEXT: lgr %r12, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r13 +; SOFT-FLOAT-NEXT: lg %r3, 168(%r15) # 8-byte Folded Reload +; SOFT-FLOAT-NEXT: brasl %r14, __adddf3@PLT +; SOFT-FLOAT-NEXT: lgr %r13, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r11 +; SOFT-FLOAT-NEXT: lg %r3, 176(%r15) # 8-byte Folded Reload +; SOFT-FLOAT-NEXT: brasl %r14, __adddf3@PLT +; SOFT-FLOAT-NEXT: lgr %r5, %r2 +; SOFT-FLOAT-NEXT: lgr %r2, %r10 +; SOFT-FLOAT-NEXT: lgr %r3, %r12 +; SOFT-FLOAT-NEXT: lgr %r4, %r13 +; SOFT-FLOAT-NEXT: lmg %r6, %r15, 232(%r15) +; SOFT-FLOAT-NEXT: br %r14 + %product = fmul contract <4 x double> %a, %b + %result = fadd contract <4 x double> %product, %c + ret <4 x double> %result +} + +attributes #0 = { "use-soft-float"="true" } + +declare float @llvm.fmuladd.f32(float %a, float %b, float %c) +declare double @llvm.fmuladd.f64(double %a, double %b, double %c) diff --git a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll new file mode 100644 index 000000000000..ccb2f37590b0 --- /dev/null +++ b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll @@ -0,0 +1,1777 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=i386 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-32 +; RUN: llc -mtriple=i386 -mattr +fma < %s | FileCheck %s -check-prefix=SOFT-FLOAT-32-FMA +; RUN: llc -mtriple=i386 -mattr +fma4 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-32-FMA4 +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-64 +; RUN: llc -mtriple=x86_64 -mattr +fma < %s | FileCheck %s -check-prefix=SOFT-FLOAT-64-FMA +; RUN: llc -mtriple=x86_64 -mattr +fma4 < %s | FileCheck %s -check-prefix=SOFT-FLOAT-64-FMA4 + +define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: .cfi_offset %esi, -8 +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %eax +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __addsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: popl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: retl +; +; SOFT-FLOAT-32-FMA-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-32-FMA: # %bb.0: +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %esi, -8 +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; 
SOFT-FLOAT-32-FMA-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: popl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: retl +; +; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-32-FMA4: # %bb.0: +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %esi, -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: popl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: retl +; +; SOFT-FLOAT-64-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: pushq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-NEXT: movl %edx, %ebx +; SOFT-FLOAT-64-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-NEXT: movl %ebx, %esi +; SOFT-FLOAT-64-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-NEXT: popq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-NEXT: retq +; +; SOFT-FLOAT-64-FMA-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-64-FMA: # %bb.0: +; SOFT-FLOAT-64-FMA-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-FMA-NEXT: movl %edx, %ebx +; SOFT-FLOAT-64-FMA-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl %ebx, %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA-NEXT: retq +; +; SOFT-FLOAT-64-FMA4-LABEL: fmuladd_intrinsic_f32: +; SOFT-FLOAT-64-FMA4: # %bb.0: +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-FMA4-NEXT: movl %edx, %ebx +; SOFT-FLOAT-64-FMA4-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl %ebx, %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA4-NEXT: retq + %result = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + ret float %result +} + +define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-NEXT: .cfi_offset %esi, -12 +; SOFT-FLOAT-32-NEXT: .cfi_offset %edi, -8 +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-NEXT: movl 
{{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __muldf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %edx +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %eax +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __adddf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: popl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: popl %edi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: retl +; +; SOFT-FLOAT-32-FMA-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-32-FMA: # %bb.0: +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %esi, -12 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %edi, -8 +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: popl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: popl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: retl +; +; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-32-FMA4: # %bb.0: +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %esi, -12 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %edi, -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: 
.cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: popl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: popl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: retl +; +; SOFT-FLOAT-64-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: pushq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-NEXT: movq %rdx, %rbx +; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-NEXT: movq %rbx, %rsi +; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-NEXT: popq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-NEXT: retq +; +; SOFT-FLOAT-64-FMA-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-64-FMA: # %bb.0: +; SOFT-FLOAT-64-FMA-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-FMA-NEXT: movq %rdx, %rbx +; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq %rbx, %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA-NEXT: retq +; +; SOFT-FLOAT-64-FMA4-LABEL: fmuladd_intrinsic_f64: +; SOFT-FLOAT-64-FMA4: # %bb.0: +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rdx, %rbx +; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq %rbx, %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA4-NEXT: retq + %result = call double @llvm.fmuladd.f64(double %a, double %b, double %c) + ret double %result +} + +define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: .cfi_offset %esi, -8 +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; 
SOFT-FLOAT-32-NEXT: pushl %eax +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __addsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: popl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: retl +; +; SOFT-FLOAT-32-FMA-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-32-FMA: # %bb.0: +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %esi, -8 +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: popl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: retl +; +; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-32-FMA4: # %bb.0: +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %esi, -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: popl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: retl +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: pushq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-NEXT: movl %edx, %ebx +; SOFT-FLOAT-64-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-NEXT: movl %ebx, %esi +; SOFT-FLOAT-64-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-NEXT: popq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-NEXT: retq +; +; SOFT-FLOAT-64-FMA-LABEL: fmuladd_contract_f32: +; SOFT-FLOAT-64-FMA: # %bb.0: +; SOFT-FLOAT-64-FMA-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-FMA-NEXT: movl %edx, %ebx +; SOFT-FLOAT-64-FMA-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl %ebx, %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA-NEXT: retq +; +; SOFT-FLOAT-64-FMA4-LABEL: fmuladd_contract_f32: +; 
SOFT-FLOAT-64-FMA4: # %bb.0: +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-FMA4-NEXT: movl %edx, %ebx +; SOFT-FLOAT-64-FMA4-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl %ebx, %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA4-NEXT: retq + %product = fmul contract float %a, %b + %result = fadd contract float %product, %c + ret float %result +} + +define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-NEXT: .cfi_offset %esi, -12 +; SOFT-FLOAT-32-NEXT: .cfi_offset %edi, -8 +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __muldf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %edx +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %eax +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __adddf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: popl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: popl %edi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: retl +; +; SOFT-FLOAT-32-FMA-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-32-FMA: # %bb.0: +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %esi, -12 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %edi, -8 +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; 
SOFT-FLOAT-32-FMA-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: popl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: popl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: retl +; +; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-32-FMA4: # %bb.0: +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %esi, -12 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %edi, -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: popl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: popl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: retl +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: pushq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-NEXT: movq %rdx, %rbx +; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-NEXT: movq %rbx, %rsi +; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-NEXT: popq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-NEXT: retq +; +; SOFT-FLOAT-64-FMA-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-64-FMA: # %bb.0: +; SOFT-FLOAT-64-FMA-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-FMA-NEXT: movq %rdx, %rbx +; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq %rbx, %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA-NEXT: retq +; +; SOFT-FLOAT-64-FMA4-LABEL: fmuladd_contract_f64: +; SOFT-FLOAT-64-FMA4: # %bb.0: +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbx, -16 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rdx, %rbx +; SOFT-FLOAT-64-FMA4-NEXT: callq 
__muldf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq %rbx, %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA4-NEXT: retq + %product = fmul contract double %a, %b + %result = fadd contract double %product, %c + ret double %result +} + +define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: pushl %ebp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: pushl %ebx +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-NEXT: pushl %eax +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32-NEXT: .cfi_offset %esi, -20 +; SOFT-FLOAT-32-NEXT: .cfi_offset %edi, -16 +; SOFT-FLOAT-32-NEXT: .cfi_offset %ebx, -12 +; SOFT-FLOAT-32-NEXT: .cfi_offset %ebp, -8 +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: movl %eax, (%esp) # 4-byte Spill +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %ebx +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: movl %eax, %ebx +; SOFT-FLOAT-32-NEXT: pushl %ebp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %eax +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __addsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: movl %eax, %ebp +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __addsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %ebx +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; 
SOFT-FLOAT-32-NEXT: calll __addsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: movl %eax, %ebx +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __addsf3 +; SOFT-FLOAT-32-NEXT: addl $8, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-NEXT: movl %eax, 12(%esi) +; SOFT-FLOAT-32-NEXT: movl %ebx, 8(%esi) +; SOFT-FLOAT-32-NEXT: movl %edi, 4(%esi) +; SOFT-FLOAT-32-NEXT: movl %ebp, (%esi) +; SOFT-FLOAT-32-NEXT: movl %esi, %eax +; SOFT-FLOAT-32-NEXT: addl $4, %esp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-NEXT: popl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-NEXT: popl %edi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-NEXT: popl %ebx +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: popl %ebp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: retl $4 +; +; SOFT-FLOAT-32-FMA-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-32-FMA: # %bb.0: +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-FMA-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %esi, -20 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %edi, -16 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %ebx, -12 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %ebp, -8 +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %ebp +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %ebx +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, (%esp) # 4-byte Spill +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %ebx +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; 
SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %ebp +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %ebx +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, 12(%esi) +; SOFT-FLOAT-32-FMA-NEXT: movl %ebx, 8(%esi) +; SOFT-FLOAT-32-FMA-NEXT: movl %edi, 4(%esi) +; SOFT-FLOAT-32-FMA-NEXT: movl %ebp, (%esi) +; SOFT-FLOAT-32-FMA-NEXT: movl %esi, %eax +; SOFT-FLOAT-32-FMA-NEXT: addl $4, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-FMA-NEXT: popl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-FMA-NEXT: popl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA-NEXT: popl %ebx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: popl %ebp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: retl $4 +; +; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-32-FMA4: # %bb.0: +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %esi, -20 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %edi, -16 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %ebx, -12 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %ebp, -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %ebx +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, (%esp) # 4-byte Spill +; 
SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %ebx +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __mulsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %ebp +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %ebx +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __addsf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $8, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, 12(%esi) +; SOFT-FLOAT-32-FMA4-NEXT: movl %ebx, 8(%esi) +; SOFT-FLOAT-32-FMA4-NEXT: movl %edi, 4(%esi) +; SOFT-FLOAT-32-FMA4-NEXT: movl %ebp, (%esi) +; SOFT-FLOAT-32-FMA4-NEXT: movl %esi, %eax +; SOFT-FLOAT-32-FMA4-NEXT: addl $4, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-FMA4-NEXT: popl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-FMA4-NEXT: popl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA4-NEXT: popl %ebx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: popl %ebp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: retl $4 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: pushq %rbp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: pushq %r15 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 
24 +; SOFT-FLOAT-64-NEXT: pushq %r14 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-NEXT: pushq %r13 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-NEXT: pushq %r12 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-NEXT: pushq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-NEXT: pushq %rax +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64-NEXT: .cfi_offset %rbx, -56 +; SOFT-FLOAT-64-NEXT: .cfi_offset %r12, -48 +; SOFT-FLOAT-64-NEXT: .cfi_offset %r13, -40 +; SOFT-FLOAT-64-NEXT: .cfi_offset %r14, -32 +; SOFT-FLOAT-64-NEXT: .cfi_offset %r15, -24 +; SOFT-FLOAT-64-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-NEXT: movl %r9d, %r13d +; SOFT-FLOAT-64-NEXT: movl %ecx, %ebp +; SOFT-FLOAT-64-NEXT: movl %edx, %r14d +; SOFT-FLOAT-64-NEXT: movl %esi, %r12d +; SOFT-FLOAT-64-NEXT: movq %rdi, %rbx +; SOFT-FLOAT-64-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-NEXT: movl %r8d, %edi +; SOFT-FLOAT-64-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %r15d +; SOFT-FLOAT-64-NEXT: movl %ebp, %edi +; SOFT-FLOAT-64-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %ebp +; SOFT-FLOAT-64-NEXT: movl %r14d, %edi +; SOFT-FLOAT-64-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %r14d +; SOFT-FLOAT-64-NEXT: movl %r12d, %edi +; SOFT-FLOAT-64-NEXT: movl %r13d, %esi +; SOFT-FLOAT-64-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %r12d +; SOFT-FLOAT-64-NEXT: movl %r14d, %edi +; SOFT-FLOAT-64-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %r14d +; SOFT-FLOAT-64-NEXT: movl %ebp, %edi +; SOFT-FLOAT-64-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, %ebp +; SOFT-FLOAT-64-NEXT: movl %r15d, %edi +; SOFT-FLOAT-64-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-NEXT: movl %eax, 12(%rbx) +; SOFT-FLOAT-64-NEXT: movl %ebp, 8(%rbx) +; SOFT-FLOAT-64-NEXT: movl %r14d, 4(%rbx) +; SOFT-FLOAT-64-NEXT: movl %r12d, (%rbx) +; SOFT-FLOAT-64-NEXT: movq %rbx, %rax +; SOFT-FLOAT-64-NEXT: addq $8, %rsp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-NEXT: popq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-NEXT: popq %r12 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-NEXT: popq %r13 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-NEXT: popq %r14 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-NEXT: popq %r15 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: popq %rbp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-NEXT: retq +; +; SOFT-FLOAT-64-FMA-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-64-FMA: # %bb.0: +; SOFT-FLOAT-64-FMA-NEXT: pushq %rbp +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA-NEXT: pushq %r15 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-FMA-NEXT: pushq %r14 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-FMA-NEXT: pushq %r13 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-FMA-NEXT: pushq %r12 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-FMA-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA-NEXT: 
.cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-FMA-NEXT: pushq %rax +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbx, -56 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r12, -48 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r13, -40 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r14, -32 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r15, -24 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-FMA-NEXT: movl %r9d, %r13d +; SOFT-FLOAT-64-FMA-NEXT: movl %ecx, %ebp +; SOFT-FLOAT-64-FMA-NEXT: movl %edx, %r14d +; SOFT-FLOAT-64-FMA-NEXT: movl %esi, %r12d +; SOFT-FLOAT-64-FMA-NEXT: movq %rdi, %rbx +; SOFT-FLOAT-64-FMA-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA-NEXT: movl %r8d, %edi +; SOFT-FLOAT-64-FMA-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %r15d +; SOFT-FLOAT-64-FMA-NEXT: movl %ebp, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %ebp +; SOFT-FLOAT-64-FMA-NEXT: movl %r14d, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %r14d +; SOFT-FLOAT-64-FMA-NEXT: movl %r12d, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl %r13d, %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %r12d +; SOFT-FLOAT-64-FMA-NEXT: movl %r14d, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %r14d +; SOFT-FLOAT-64-FMA-NEXT: movl %ebp, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, %ebp +; SOFT-FLOAT-64-FMA-NEXT: movl %r15d, %edi +; SOFT-FLOAT-64-FMA-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movl %eax, 12(%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movl %ebp, 8(%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movl %r14d, 4(%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movl %r12d, (%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movq %rbx, %rax +; SOFT-FLOAT-64-FMA-NEXT: addq $8, %rsp +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-FMA-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-FMA-NEXT: popq %r12 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-FMA-NEXT: popq %r13 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-FMA-NEXT: popq %r14 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-FMA-NEXT: popq %r15 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA-NEXT: popq %rbp +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA-NEXT: retq +; +; SOFT-FLOAT-64-FMA4-LABEL: fmuladd_contract_v4f32: +; SOFT-FLOAT-64-FMA4: # %bb.0: +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rbp +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %r15 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %r14 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %r13 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %r12 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 56 +; 
SOFT-FLOAT-64-FMA4-NEXT: pushq %rax +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbx, -56 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r12, -48 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r13, -40 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r14, -32 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r15, -24 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-FMA4-NEXT: movl %r9d, %r13d +; SOFT-FLOAT-64-FMA4-NEXT: movl %ecx, %ebp +; SOFT-FLOAT-64-FMA4-NEXT: movl %edx, %r14d +; SOFT-FLOAT-64-FMA4-NEXT: movl %esi, %r12d +; SOFT-FLOAT-64-FMA4-NEXT: movq %rdi, %rbx +; SOFT-FLOAT-64-FMA4-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA4-NEXT: movl %r8d, %edi +; SOFT-FLOAT-64-FMA4-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %r15d +; SOFT-FLOAT-64-FMA4-NEXT: movl %ebp, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %ebp +; SOFT-FLOAT-64-FMA4-NEXT: movl %r14d, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %r14d +; SOFT-FLOAT-64-FMA4-NEXT: movl %r12d, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl %r13d, %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __mulsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %r12d +; SOFT-FLOAT-64-FMA4-NEXT: movl %r14d, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %r14d +; SOFT-FLOAT-64-FMA4-NEXT: movl %ebp, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, %ebp +; SOFT-FLOAT-64-FMA4-NEXT: movl %r15d, %edi +; SOFT-FLOAT-64-FMA4-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SOFT-FLOAT-64-FMA4-NEXT: callq __addsf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movl %eax, 12(%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movl %ebp, 8(%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movl %r14d, 4(%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movl %r12d, (%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movq %rbx, %rax +; SOFT-FLOAT-64-FMA4-NEXT: addq $8, %rsp +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-FMA4-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-FMA4-NEXT: popq %r12 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-FMA4-NEXT: popq %r13 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-FMA4-NEXT: popq %r14 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-FMA4-NEXT: popq %r15 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA4-NEXT: popq %rbp +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA4-NEXT: retq + %product = fmul contract <4 x float> %a, %b + %result = fadd contract <4 x float> %product, %c + ret <4 x float> %result +} + +define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 { +; SOFT-FLOAT-32-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-32: # %bb.0: +; SOFT-FLOAT-32-NEXT: pushl %ebp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: pushl %ebx +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: 
.cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-NEXT: subl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 36 +; SOFT-FLOAT-32-NEXT: .cfi_offset %esi, -20 +; SOFT-FLOAT-32-NEXT: .cfi_offset %edi, -16 +; SOFT-FLOAT-32-NEXT: .cfi_offset %ebx, -12 +; SOFT-FLOAT-32-NEXT: .cfi_offset %ebp, -8 +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __muldf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-NEXT: pushl %ebp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %ebx +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __muldf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: movl %eax, %esi +; SOFT-FLOAT-32-NEXT: movl %edx, %ebp +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __muldf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-NEXT: movl %edx, %ebx +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __muldf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %edx +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %eax +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __adddf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-NEXT: movl %edx, (%esp) # 4-byte Spill +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; 
SOFT-FLOAT-32-NEXT: pushl %ebx +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %edi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __adddf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-NEXT: movl %edx, %ebx +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %ebp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl %esi +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __adddf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: movl %eax, %ebp +; SOFT-FLOAT-32-NEXT: movl %edx, %esi +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: calll __adddf3 +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SOFT-FLOAT-32-NEXT: movl %edx, 28(%ecx) +; SOFT-FLOAT-32-NEXT: movl %eax, 24(%ecx) +; SOFT-FLOAT-32-NEXT: movl %esi, 20(%ecx) +; SOFT-FLOAT-32-NEXT: movl %ebp, 16(%ecx) +; SOFT-FLOAT-32-NEXT: movl %ebx, 12(%ecx) +; SOFT-FLOAT-32-NEXT: movl %edi, 8(%ecx) +; SOFT-FLOAT-32-NEXT: movl (%esp), %eax # 4-byte Reload +; SOFT-FLOAT-32-NEXT: movl %eax, 4(%ecx) +; SOFT-FLOAT-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SOFT-FLOAT-32-NEXT: movl %eax, (%ecx) +; SOFT-FLOAT-32-NEXT: movl %ecx, %eax +; SOFT-FLOAT-32-NEXT: addl $16, %esp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-NEXT: popl %esi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-NEXT: popl %edi +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-NEXT: popl %ebx +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-NEXT: popl %ebp +; SOFT-FLOAT-32-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-NEXT: retl $4 +; +; SOFT-FLOAT-32-FMA-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-32-FMA: # %bb.0: +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-FMA-NEXT: subl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 36 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %esi, -20 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %edi, -16 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %ebx, -12 +; SOFT-FLOAT-32-FMA-NEXT: .cfi_offset %ebp, -8 +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %ebx +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %ebp +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl 
{{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-FMA-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %esi +; SOFT-FLOAT-32-FMA-NEXT: movl %edx, %ebp +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-FMA-NEXT: movl %edx, %ebx +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-FMA-NEXT: movl %edx, (%esp) # 4-byte Spill +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %edi 
+; SOFT-FLOAT-32-FMA-NEXT: movl %edx, %ebx +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, %ebp +; SOFT-FLOAT-32-FMA-NEXT: movl %edx, %esi +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SOFT-FLOAT-32-FMA-NEXT: movl %edx, 28(%ecx) +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, 24(%ecx) +; SOFT-FLOAT-32-FMA-NEXT: movl %esi, 20(%ecx) +; SOFT-FLOAT-32-FMA-NEXT: movl %ebp, 16(%ecx) +; SOFT-FLOAT-32-FMA-NEXT: movl %ebx, 12(%ecx) +; SOFT-FLOAT-32-FMA-NEXT: movl %edi, 8(%ecx) +; SOFT-FLOAT-32-FMA-NEXT: movl (%esp), %eax # 4-byte Reload +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, 4(%ecx) +; SOFT-FLOAT-32-FMA-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SOFT-FLOAT-32-FMA-NEXT: movl %eax, (%ecx) +; SOFT-FLOAT-32-FMA-NEXT: movl %ecx, %eax +; SOFT-FLOAT-32-FMA-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-FMA-NEXT: popl %esi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-FMA-NEXT: popl %edi +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA-NEXT: popl %ebx +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA-NEXT: popl %ebp +; SOFT-FLOAT-32-FMA-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA-NEXT: retl $4 +; +; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-32-FMA4: # %bb.0: +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-FMA4-NEXT: subl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 36 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %esi, -20 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %edi, -16 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %ebx, -12 +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_offset %ebp, -8 +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %edi +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %ebx +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %esi +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl 
{{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-FMA4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %esi +; SOFT-FLOAT-32-FMA4-NEXT: movl %edx, %ebp +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-FMA4-NEXT: movl %edx, %ebx +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __muldf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %eax +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; SOFT-FLOAT-32-FMA4-NEXT: movl %edx, (%esp) # 4-byte Spill +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %edi +; SOFT-FLOAT-32-FMA4-NEXT: movl 
%edx, %ebx +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %ebp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, %ebp +; SOFT-FLOAT-32-FMA4-NEXT: movl %edx, %esi +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[0-9]+}}(%esp) +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: calll __adddf3 +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_adjust_cfa_offset -16 +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SOFT-FLOAT-32-FMA4-NEXT: movl %edx, 28(%ecx) +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, 24(%ecx) +; SOFT-FLOAT-32-FMA4-NEXT: movl %esi, 20(%ecx) +; SOFT-FLOAT-32-FMA4-NEXT: movl %ebp, 16(%ecx) +; SOFT-FLOAT-32-FMA4-NEXT: movl %ebx, 12(%ecx) +; SOFT-FLOAT-32-FMA4-NEXT: movl %edi, 8(%ecx) +; SOFT-FLOAT-32-FMA4-NEXT: movl (%esp), %eax # 4-byte Reload +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, 4(%ecx) +; SOFT-FLOAT-32-FMA4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; SOFT-FLOAT-32-FMA4-NEXT: movl %eax, (%ecx) +; SOFT-FLOAT-32-FMA4-NEXT: movl %ecx, %eax +; SOFT-FLOAT-32-FMA4-NEXT: addl $16, %esp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 20 +; SOFT-FLOAT-32-FMA4-NEXT: popl %esi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-32-FMA4-NEXT: popl %edi +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 12 +; SOFT-FLOAT-32-FMA4-NEXT: popl %ebx +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-32-FMA4-NEXT: popl %ebp +; SOFT-FLOAT-32-FMA4-NEXT: .cfi_def_cfa_offset 4 +; SOFT-FLOAT-32-FMA4-NEXT: retl $4 +; +; SOFT-FLOAT-64-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-64: # %bb.0: +; SOFT-FLOAT-64-NEXT: pushq %rbp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: pushq %r15 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-NEXT: pushq %r14 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-NEXT: pushq %r13 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-NEXT: pushq %r12 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-NEXT: pushq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-NEXT: pushq %rax +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64-NEXT: .cfi_offset %rbx, -56 +; SOFT-FLOAT-64-NEXT: .cfi_offset %r12, -48 +; SOFT-FLOAT-64-NEXT: .cfi_offset %r13, -40 +; SOFT-FLOAT-64-NEXT: .cfi_offset %r14, -32 +; SOFT-FLOAT-64-NEXT: .cfi_offset %r15, -24 +; SOFT-FLOAT-64-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-NEXT: movq %rcx, %r14 +; SOFT-FLOAT-64-NEXT: movq %rdx, %r15 +; SOFT-FLOAT-64-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-NEXT: movq %rdi, %rbx +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: movq 
%r8, %rdi +; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-NEXT: movq %r14, %rdi +; SOFT-FLOAT-64-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %r14 +; SOFT-FLOAT-64-NEXT: movq %r15, %rdi +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %r15 +; SOFT-FLOAT-64-NEXT: movq %r12, %rdi +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-NEXT: movq %r15, %rdi +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %r15 +; SOFT-FLOAT-64-NEXT: movq %r14, %rdi +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, %r14 +; SOFT-FLOAT-64-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-NEXT: movq %rax, 24(%rbx) +; SOFT-FLOAT-64-NEXT: movq %r14, 16(%rbx) +; SOFT-FLOAT-64-NEXT: movq %r15, 8(%rbx) +; SOFT-FLOAT-64-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-NEXT: movq %rbx, %rax +; SOFT-FLOAT-64-NEXT: addq $8, %rsp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-NEXT: popq %rbx +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-NEXT: popq %r12 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-NEXT: popq %r13 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-NEXT: popq %r14 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-NEXT: popq %r15 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-NEXT: popq %rbp +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-NEXT: retq +; +; SOFT-FLOAT-64-FMA-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-64-FMA: # %bb.0: +; SOFT-FLOAT-64-FMA-NEXT: pushq %rbp +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA-NEXT: pushq %r15 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-FMA-NEXT: pushq %r14 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-FMA-NEXT: pushq %r13 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-FMA-NEXT: pushq %r12 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-FMA-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-FMA-NEXT: pushq %rax +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbx, -56 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r12, -48 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r13, -40 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r14, -32 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r15, -24 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-FMA-NEXT: movq %rcx, %r14 +; SOFT-FLOAT-64-FMA-NEXT: movq %rdx, %r15 +; SOFT-FLOAT-64-FMA-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-FMA-NEXT: movq %rdi, %rbx +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: movq %r8, %rdi +; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-FMA-NEXT: movq %r14, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT +; 
SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r14 +; SOFT-FLOAT-64-FMA-NEXT: movq %r15, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r15 +; SOFT-FLOAT-64-FMA-NEXT: movq %r12, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-FMA-NEXT: movq %r15, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r15 +; SOFT-FLOAT-64-FMA-NEXT: movq %r14, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r14 +; SOFT-FLOAT-64-FMA-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, 24(%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movq %r14, 16(%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movq %r15, 8(%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movq %rbx, %rax +; SOFT-FLOAT-64-FMA-NEXT: addq $8, %rsp +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-FMA-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-FMA-NEXT: popq %r12 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-FMA-NEXT: popq %r13 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-FMA-NEXT: popq %r14 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-FMA-NEXT: popq %r15 +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA-NEXT: popq %rbp +; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA-NEXT: retq +; +; SOFT-FLOAT-64-FMA4-LABEL: fmuladd_contract_v4f64: +; SOFT-FLOAT-64-FMA4: # %bb.0: +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rbp +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %r15 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %r14 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %r13 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %r12 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-FMA4-NEXT: pushq %rax +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 64 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbx, -56 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r12, -48 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r13, -40 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r14, -32 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r15, -24 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rcx, %r14 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rdx, %r15 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rdi, %rbx +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: movq %r8, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: 
movq %rax, %r14 +; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r15 +; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r15 +; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r14 +; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, 24(%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, 16(%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, 8(%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movq %rbx, %rax +; SOFT-FLOAT-64-FMA4-NEXT: addq $8, %rsp +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 56 +; SOFT-FLOAT-64-FMA4-NEXT: popq %rbx +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 48 +; SOFT-FLOAT-64-FMA4-NEXT: popq %r12 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 40 +; SOFT-FLOAT-64-FMA4-NEXT: popq %r13 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 32 +; SOFT-FLOAT-64-FMA4-NEXT: popq %r14 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 24 +; SOFT-FLOAT-64-FMA4-NEXT: popq %r15 +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 16 +; SOFT-FLOAT-64-FMA4-NEXT: popq %rbp +; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 8 +; SOFT-FLOAT-64-FMA4-NEXT: retq + %product = fmul contract <4 x double> %a, %b + %result = fadd contract <4 x double> %product, %c + ret <4 x double> %result +} + +attributes #0 = { "use-soft-float"="true" } + +declare float @llvm.fmuladd.f32(float %a, float %b, float %c) +declare double @llvm.fmuladd.f64(double %a, double %b, double %c) -- GitLab From 5aec88f0e6920b27dbc6cf7b4625088291441210 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sat, 19 Oct 2024 09:57:56 -0400 Subject: [PATCH 156/511] [hwasan], [gn]: Fix formatting of hwasan cmake; re-sync gn file for b515d9ea1e43 --- compiler-rt/lib/hwasan/CMakeLists.txt | 9 ++++++--- .../utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn | 10 ++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt index 086079c7536e..afafa0c4a927 100644 --- a/compiler-rt/lib/hwasan/CMakeLists.txt +++ b/compiler-rt/lib/hwasan/CMakeLists.txt @@ -24,16 +24,19 @@ foreach(arch ${HWASAN_SUPPORTED_ARCH}) if(${arch} MATCHES "aarch64") list(APPEND HWASAN_RTL_SOURCES hwasan_setjmp_aarch64.S - hwasan_tag_mismatch_aarch64.S) + hwasan_tag_mismatch_aarch64.S + ) endif() if(${arch} MATCHES "riscv64") list(APPEND HWASAN_RTL_SOURCES hwasan_setjmp_riscv64.S - hwasan_tag_mismatch_riscv64.S) + hwasan_tag_mismatch_riscv64.S + ) endif() if(${arch} MATCHES "x86_64") list(APPEND HWASAN_RTL_SOURCES - hwasan_setjmp_x86_64.S) + hwasan_setjmp_x86_64.S + ) endif() endforeach() diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn 
b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn
index f453dde0ea93..e39d8114d1f4 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn
@@ -64,10 +64,16 @@ source_set("sources") {
     "hwasan_type_test.cpp",
   ]
   if (current_cpu == "arm64") {
-    sources += [ "hwasan_setjmp_aarch64.S" ]
+    sources += [
+      "hwasan_setjmp_aarch64.S",
+      "hwasan_tag_mismatch_aarch64.S",
+    ]
   }
   if (current_cpu == "riscv64") {
-    sources += [ "hwasan_setjmp_riscv64.S" ]
+    sources += [
+      "hwasan_setjmp_riscv64.S",
+      "hwasan_tag_mismatch_riscv64.S",
+    ]
   }
   if (current_cpu == "x64") {
     sources += [ "hwasan_setjmp_x86_64.S" ]
-- 
GitLab

From 0f0a96b8621fcc8e1d6b6a3d047c263bb17a7f39 Mon Sep 17 00:00:00 2001
From: Youngsuk Kim
Date: Sat, 19 Oct 2024 10:05:36 -0400
Subject: [PATCH 157/511] [llvm][NVPTX] Strip unneeded '+0' in PTX load/store
 (#113017)

Remove the extraneous '+0' immediate offset part in PTX load/stores, to
improve readability of output PTX code.
---
 clang/test/CodeGenCUDA/bf16.cu                |   8 +-
 .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp   |  10 +
 .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h     |   2 +
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       | 114 +++---
 .../test/CodeGen/NVPTX/LoadStoreVectorizer.ll |  12 +-
 llvm/test/CodeGen/NVPTX/activemask.ll         |   4 +-
 llvm/test/CodeGen/NVPTX/addr-mode.ll          |  10 +-
 llvm/test/CodeGen/NVPTX/aggregate-return.ll   |   8 +-
 llvm/test/CodeGen/NVPTX/bf16-instructions.ll  | 190 +++++-----
 .../NVPTX/bf16x2-instructions-approx.ll       |   4 +-
 .../test/CodeGen/NVPTX/bf16x2-instructions.ll |  60 +--
 llvm/test/CodeGen/NVPTX/bswap.ll              |   8 +-
 .../CodeGen/NVPTX/call-with-alloca-buffer.ll  |   4 +-
 llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll |   4 +-
 llvm/test/CodeGen/NVPTX/chain-different-as.ll |   2 +-
 llvm/test/CodeGen/NVPTX/cmpxchg.ll            |  16 +-
 llvm/test/CodeGen/NVPTX/combine-mad.ll        |  16 +-
 .../CodeGen/NVPTX/compute-ptx-value-vts.ll    |   8 +-
 llvm/test/CodeGen/NVPTX/convert-int-sm20.ll   |  12 +-
 llvm/test/CodeGen/NVPTX/copysign.ll           |  12 +-
 llvm/test/CodeGen/NVPTX/dot-product.ll        |  26 +-
 llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll |   4 +-
 llvm/test/CodeGen/NVPTX/elect.ll              |   6 +-
 llvm/test/CodeGen/NVPTX/extractelement.ll     |  12 +-
 llvm/test/CodeGen/NVPTX/f16-instructions.ll   | 158 ++++----
 llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 158 ++++----
 llvm/test/CodeGen/NVPTX/i128-param.ll         |   8 +-
 llvm/test/CodeGen/NVPTX/i128-retval.ll        |   4 +-
 llvm/test/CodeGen/NVPTX/i128-struct.ll        |   2 +-
 llvm/test/CodeGen/NVPTX/i128.ll               |  18 +-
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 100 ++---
 llvm/test/CodeGen/NVPTX/i8x4-instructions.ll  | 118 +++---
 llvm/test/CodeGen/NVPTX/indirect_byval.ll     |  16 +-
 llvm/test/CodeGen/NVPTX/jump-table.ll         |  14 +-
 llvm/test/CodeGen/NVPTX/ldparam-v4.ll         |   2 +-
 llvm/test/CodeGen/NVPTX/local-stack-frame.ll  |   4 +-
 llvm/test/CodeGen/NVPTX/lower-alloca.ll       |   2 +-
 .../CodeGen/NVPTX/lower-args-gridconstant.ll  |  28 +-
 llvm/test/CodeGen/NVPTX/lower-args.ll         |   6 +-
 llvm/test/CodeGen/NVPTX/math-intrins.ll       | 174 ++++-----
 llvm/test/CodeGen/NVPTX/mulhi-intrins.ll      |  12 +-
 .../CodeGen/NVPTX/nvvm-reflect-arch-O0.ll     |  30 +-
 llvm/test/CodeGen/NVPTX/param-load-store.ll   | 344 +++++++++---------
 llvm/test/CodeGen/NVPTX/param-overalign.ll    |  16 +-
 .../CodeGen/NVPTX/param-vectorize-device.ll   |  76 ++--
 .../CodeGen/NVPTX/proxy-reg-erasure-ptx.ll    |  62 ++--
 llvm/test/CodeGen/NVPTX/rcp-opt.ll            |   6 +-
 llvm/test/CodeGen/NVPTX/rotate.ll             |  48 +--
 llvm/test/CodeGen/NVPTX/rotate_64.ll          |   4 +-
 llvm/test/CodeGen/NVPTX/sad-intrins.ll        |  12 +-
 llvm/test/CodeGen/NVPTX/sext-setcc.ll         |   4 +-
 llvm/test/CodeGen/NVPTX/st-param-imm.ll       | 166 ++++-----
 llvm/test/CodeGen/NVPTX/store-undef.ll        |   2 +-
 llvm/test/CodeGen/NVPTX/tex-read-cuda.ll      |   4 +-
 llvm/test/CodeGen/NVPTX/tid-range.ll          |   2 +-
 .../NVPTX/unaligned-param-load-store.ll       |  42 +--
 ...unfold-masked-merge-vector-variablemask.ll |  58 +--
 llvm/test/CodeGen/NVPTX/vaargs.ll             |  22 +-
 llvm/test/CodeGen/NVPTX/variadics-backend.ll  |  48 +--
 llvm/test/CodeGen/NVPTX/vec-param-load.ll     |  14 +-
 llvm/test/CodeGen/NVPTX/vector-args.ll        |   2 +-
 llvm/test/CodeGen/NVPTX/vector-call.ll        |   4 +-
 llvm/test/CodeGen/NVPTX/vector-returns.ll     |  76 ++--
 .../DebugInfo/NVPTX/dbg-declare-alloca.ll     |   2 +-
 .../NaryReassociate/NVPTX/nary-slsr.ll        |   6 +-
 .../Inputs/nvptx-basic.ll.expected            |   8 +-
 66 files changed, 1225 insertions(+), 1209 deletions(-)

diff --git a/clang/test/CodeGenCUDA/bf16.cu b/clang/test/CodeGenCUDA/bf16.cu
index 3c443420dbd3..f794b83239f1 100644
--- a/clang/test/CodeGenCUDA/bf16.cu
+++ b/clang/test/CodeGenCUDA/bf16.cu
@@ -25,7 +25,7 @@ __device__ void test_arg(__bf16 *out, __bf16 in) {
 __device__ __bf16 test_ret( __bf16 in) {
 // CHECK: ld.param.b16 %[[R:rs[0-9]+]], [_Z8test_retDF16b_param_0];
   return in;
-// CHECK: st.param.b16 [func_retval0+0], %[[R]]
+// CHECK: st.param.b16 [func_retval0], %[[R]]
 // CHECK: ret;
 }
 
@@ -35,15 +35,15 @@ __device__ __bf16 external_func( __bf16 in);
 // CHECK: .param .align 2 .b8 _Z9test_callDF16b_param_0[2]
 __device__ __bf16 test_call( __bf16 in) {
 // CHECK: ld.param.b16 %[[R:rs[0-9]+]], [_Z9test_callDF16b_param_0];
-// CHECK: st.param.b16 [param0+0], %[[R]];
+// CHECK: st.param.b16 [param0], %[[R]];
 // CHECK: .param .align 2 .b8 retval0[2];
 // CHECK: call.uni (retval0),
 // CHECK-NEXT: _Z13external_funcDF16b,
 // CHECK-NEXT: (
 // CHECK-NEXT: param0
 // CHECK-NEXT );
-// CHECK: ld.param.b16 %[[RET:rs[0-9]+]], [retval0+0];
+// CHECK: ld.param.b16 %[[RET:rs[0-9]+]], [retval0];
   return external_func(in);
-// CHECK: st.param.b16 [func_retval0+0], %[[RET]]
+// CHECK: st.param.b16 [func_retval0], %[[RET]]
 // CHECK: ret;
 }
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 7d6442a61112..9b5892844632 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -363,6 +363,16 @@ void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
   }
 }
 
+void NVPTXInstPrinter::printOffseti32imm(const MCInst *MI, int OpNum,
+                                         raw_ostream &O, const char *Modifier) {
+  auto &Op = MI->getOperand(OpNum);
+  assert(Op.isImm() && "Invalid operand");
+  if (Op.getImm() != 0) {
+    O << "+";
+    printOperand(MI, OpNum, O);
+  }
+}
+
 void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
                                        raw_ostream &O, const char *Modifier) {
   const MCOperand &Op = MI->getOperand(OpNum);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index e6954f861cd1..e8a4a6dbdd53 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -45,6 +45,8 @@ public:
                        const char *Modifier = nullptr);
   void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O,
                        const char *Modifier = nullptr);
+  void printOffseti32imm(const MCInst *MI, int OpNum, raw_ostream &O,
+                         const char *Modifier = nullptr);
   void printProtoIdent(const MCInst *MI, int OpNum, raw_ostream &O,
                        const char *Modifier = nullptr);
   void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8b34ce4f1001..b5478b8f09ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1934,6 +1934,10 @@ def MmaCode : Operand<i32> {
   let PrintMethod = "printMmaCode";
 }
 
+def Offseti32imm : Operand<i32> {
+  let PrintMethod = "printOffseti32imm";
+}
+
 def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
 def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
 
@@ -2482,21 +2486,21 @@ def ProxyReg :
 
 let mayLoad = true in {
   class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
-                  !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
+        NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b),
+                  !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"),
                   []>;
 
   class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+        NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b),
                   !strconcat("ld.param.v2", opstr,
-                             " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
+                             " \t{{$dst, $dst2}}, [retval0$b];"), []>;
 
   class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
         NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                         regclass:$dst4),
-                  (ins i32imm:$b),
+                  (ins Offseti32imm:$b),
                   !strconcat("ld.param.v4", opstr,
-                             " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+                             " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"),
                   []>;
 }
 
@@ -2512,8 +2516,8 @@ let mayStore = true in {
     if !or(support_imm, !isa<NVPTXRegClass>(op)) then
       def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
         : NVPTXInst<(outs),
-                    (ins op:$val, i32imm:$a, i32imm:$b),
-                    "st.param" # opstr # " \t[param$a+$b], $val;",
+                    (ins op:$val, i32imm:$a, Offseti32imm:$b),
+                    "st.param" # opstr # " \t[param$a$b], $val;",
                     []>;
 }
 
@@ -2524,8 +2528,8 @@ let mayStore = true in {
           # !if(!isa<NVPTXRegClass>(op2), "r", "i")
         : NVPTXInst<(outs),
                     (ins op1:$val1, op2:$val2,
-                         i32imm:$a, i32imm:$b),
-                    "st.param.v2" # opstr # " \t[param$a+$b], {{$val1, $val2}};",
+                         i32imm:$a, Offseti32imm:$b),
+                    "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};",
                     []>;
 }
 
@@ -2541,29 +2545,29 @@ let mayStore = true in {
         : NVPTXInst<(outs),
                     (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
-                         i32imm:$a, i32imm:$b),
+                         i32imm:$a, Offseti32imm:$b),
                     "st.param.v4" # opstr #
-                    " \t[param$a+$b], {{$val1, $val2, $val3, $val4}};",
+                    " \t[param$a$b], {{$val1, $val2, $val3, $val4}};",
                     []>;
 
   class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
-                  !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
+        NVPTXInst<(outs), (ins regclass:$val, Offseti32imm:$a),
+                  !strconcat("st.param", opstr, " \t[func_retval0$a], $val;"),
                   []>;
 
   class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
-        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, Offseti32imm:$a),
                   !strconcat("st.param.v2", opstr,
-                             " \t[func_retval0+$a], {{$val, $val2}};"),
+                             " \t[func_retval0$a], {{$val, $val2}};"),
                   []>;
 
   class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
         NVPTXInst<(outs),
                   (ins regclass:$val, regclass:$val2, regclass:$val3,
-                       regclass:$val4, i32imm:$a),
+                       regclass:$val4, Offseti32imm:$a),
                   !strconcat("st.param.v4", opstr,
-                             " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+                             " \t[func_retval0$a], {{$val, $val2, $val3, $val4}};"),
                   []>;
 }
 
@@ -2827,21 +2831,21 @@ multiclass LD<NVPTXRegClass regclass> {
   def _ari : NVPTXInst<
     (outs regclass:$dst),
     (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
-         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+         i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
     "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-    "\t$dst, [$addr+$offset];", []>;
+    "\t$dst, [$addr$offset];", []>;
   def _ari_64 : NVPTXInst<
     (outs regclass:$dst),
     (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
     "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-    "\t$dst, [$addr+$offset];", []>;
+    "\t$dst, [$addr$offset];", []>;
   def _asi : NVPTXInst<
     (outs regclass:$dst),
     (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
     "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-    "\t$dst, [$addr+$offset];", []>;
+    "\t$dst, [$addr$offset];", []>;
 }
 
 let mayLoad=1, hasSideEffects=0 in {
@@ -2876,23 +2880,23 @@ multiclass ST<NVPTXRegClass regclass> {
     (outs),
     (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr,
-         i32imm:$offset),
+         Offseti32imm:$offset),
     "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
-    " \t[$addr+$offset], $src;", []>;
+    " \t[$addr$offset], $src;", []>;
   def _ari_64 : NVPTXInst<
     (outs),
     (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr,
-         i32imm:$offset),
+         Offseti32imm:$offset),
     "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
-    " \t[$addr+$offset], $src;", []>;
+    " \t[$addr$offset], $src;", []>;
   def _asi : NVPTXInst<
    (outs),
    (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr,
-        i32imm:$offset),
+        Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
-   " \t[$addr+$offset], $src;", []>;
+   " \t[$addr$offset], $src;", []>;
 }
 
 let mayStore=1, hasSideEffects=0 in {
@@ -2929,21 +2933,21 @@ multiclass LD_VEC<NVPTXRegClass regclass> {
  def _v2_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-        LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+        LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+   "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
  def _v2_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-        LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+        LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+   "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
  def _v2_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-        LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+        LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+   "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
  def _v4_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
@@ -2965,21 +2969,21 @@ multiclass LD_VEC<NVPTXRegClass regclass> {
  def _v4_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-        LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+        LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+   "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
  def _v4_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-        LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+        LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+   "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
  def _v4_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-        LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+        LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+   "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
 }
 let mayLoad=1, hasSideEffects=0 in {
  defm LDV_i8 : LD_VEC<Int16Regs>;
@@ -3016,23 +3020,23 @@ multiclass ST_VEC<NVPTXRegClass regclass> {
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
-        Int32Regs:$addr, i32imm:$offset),
+        Int32Regs:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t[$addr+$offset], {{$src1, $src2}};", []>;
+   "\t[$addr$offset], {{$src1, $src2}};", []>;
  def _v2_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
-        Int64Regs:$addr, i32imm:$offset),
+        Int64Regs:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t[$addr+$offset], {{$src1, $src2}};", []>;
+   "\t[$addr$offset], {{$src1, $src2}};", []>;
  def _v2_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
-        imem:$addr, i32imm:$offset),
+        imem:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t[$addr+$offset], {{$src1, $src2}};", []>;
+   "\t[$addr$offset], {{$src1, $src2}};", []>;
  def _v4_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
@@ -3058,23 +3062,23 @@ multiclass ST_VEC<NVPTXRegClass regclass> {
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
-        LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+        LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
-   "\t[$addr+$offset], 
{{$src1, $src2, $src3, $src4}};", []>; + "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset), "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_asi : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset), "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}" - "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + "$fromWidth \t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>; } let mayStore=1, hasSideEffects=0 in { @@ -3903,4 +3907,4 @@ def atomic_thread_fence_seq_cst_cta : Requires<[hasPTX<60>, hasSM<70>]>; def atomic_thread_fence_acq_rel_cta : NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>, - Requires<[hasPTX<60>, hasSM<70>]>; \ No newline at end of file + Requires<[hasPTX<60>, hasSM<70>]>; diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index bc58a700cb98..028fab7ae54d 100644 --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -19,7 +19,7 @@ define i32 @f(ptr %p) { ; ENABLED-NEXT: ld.param.u64 %rd1, [f_param_0]; ; ENABLED-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1]; ; ENABLED-NEXT: add.s32 %r3, %r1, %r2; -; ENABLED-NEXT: st.param.b32 [func_retval0+0], %r3; +; ENABLED-NEXT: st.param.b32 [func_retval0], %r3; ; ENABLED-NEXT: ret; ; ; DISABLED-LABEL: f( @@ -32,7 +32,7 @@ define i32 @f(ptr %p) { ; DISABLED-NEXT: ld.u32 %r1, [%rd1]; ; DISABLED-NEXT: ld.u32 %r2, [%rd1+4]; ; DISABLED-NEXT: add.s32 %r3, %r1, %r2; -; DISABLED-NEXT: st.param.b32 [func_retval0+0], %r3; +; DISABLED-NEXT: st.param.b32 [func_retval0], %r3; ; DISABLED-NEXT: ret; %p.1 = getelementptr i32, ptr %p, i32 1 %v0 = load i32, ptr %p, align 8 @@ -68,7 +68,7 @@ define half @fh(ptr %p) { ; ENABLED-NEXT: cvt.f32.f16 %f11, %rs5; ; ENABLED-NEXT: add.rn.f32 %f12, %f10, %f11; ; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %f12; -; ENABLED-NEXT: st.param.b16 [func_retval0+0], %rs9; +; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9; ; ENABLED-NEXT: ret; ; ; DISABLED-LABEL: fh( @@ -100,7 +100,7 @@ define half @fh(ptr %p) { ; DISABLED-NEXT: cvt.f32.f16 %f11, %rs5; ; DISABLED-NEXT: add.rn.f32 %f12, %f10, %f11; ; DISABLED-NEXT: cvt.rn.f16.f32 %rs9, %f12; -; DISABLED-NEXT: st.param.b16 [func_retval0+0], %rs9; +; DISABLED-NEXT: st.param.b16 [func_retval0], %rs9; ; DISABLED-NEXT: ret; %p.1 = getelementptr half, ptr %p, i32 1 %p.2 = getelementptr half, ptr %p, i32 2 @@ -132,7 +132,7 @@ define float @ff(ptr %p) { ; ENABLED-NEXT: add.rn.f32 %f7, %f3, %f4; ; ENABLED-NEXT: add.rn.f32 %f8, %f6, %f7; ; ENABLED-NEXT: add.rn.f32 %f9, %f8, %f5; -; ENABLED-NEXT: st.param.f32 [func_retval0+0], %f9; +; ENABLED-NEXT: st.param.f32 [func_retval0], %f9; ; ENABLED-NEXT: ret; ; ; DISABLED-LABEL: ff( @@ -151,7 +151,7 @@ define float @ff(ptr %p) { ; DISABLED-NEXT: add.rn.f32 %f7, %f3, %f4; ; DISABLED-NEXT: add.rn.f32 %f8, %f6, 
%f7; ; DISABLED-NEXT: add.rn.f32 %f9, %f8, %f5; -; DISABLED-NEXT: st.param.f32 [func_retval0+0], %f9; +; DISABLED-NEXT: st.param.f32 [func_retval0], %f9; ; DISABLED-NEXT: ret; %p.1 = getelementptr float, ptr %p, i32 1 %p.2 = getelementptr float, ptr %p, i32 2 diff --git a/llvm/test/CodeGen/NVPTX/activemask.ll b/llvm/test/CodeGen/NVPTX/activemask.ll index 1496b2ebdd44..e1d169d17c60 100644 --- a/llvm/test/CodeGen/NVPTX/activemask.ll +++ b/llvm/test/CodeGen/NVPTX/activemask.ll @@ -6,7 +6,7 @@ declare i32 @llvm.nvvm.activemask() ; CHECK-LABEL: activemask( ; ; CHECK: activemask.b32 %[[REG:.+]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %[[REG]]; +; CHECK-NEXT: st.param.b32 [func_retval0], %[[REG]]; ; CHECK-NEXT: ret; define dso_local i32 @activemask() { entry: @@ -18,7 +18,7 @@ entry: ; ; CHECK: activemask.b32 %[[REG:.+]]; ; CHECK: activemask.b32 %[[REG]]; -; CHECK: .param.b32 [func_retval0+0], %[[REG]]; +; CHECK: .param.b32 [func_retval0], %[[REG]]; ; CHECK-NEXT: ret; define dso_local i32 @convergent(i1 %cond) { entry: diff --git a/llvm/test/CodeGen/NVPTX/addr-mode.ll b/llvm/test/CodeGen/NVPTX/addr-mode.ll index a6a085c0e2e3..ca2a74f7e54a 100644 --- a/llvm/test/CodeGen/NVPTX/addr-mode.ll +++ b/llvm/test/CodeGen/NVPTX/addr-mode.ll @@ -12,7 +12,7 @@ define i32 @test_addr_mode_i64(ptr %x) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i64_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1+-4]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i64 -1 %res = load i32, ptr %addr @@ -28,7 +28,7 @@ define i32 @test_addr_mode_i32(ptr %x) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i32_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1+-4]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i32 -1 %res = load i32, ptr %addr @@ -44,7 +44,7 @@ define i32 @test_addr_mode_i16(ptr %x) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i16_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1+-4]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i16 -1 %res = load i32, ptr %addr @@ -60,7 +60,7 @@ define i32 @test_addr_mode_i8(ptr %x) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1+-4]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i8 -1 %res = load i32, ptr %addr @@ -77,7 +77,7 @@ define i32 @test_addr_mode_i64_large(ptr %x) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i64_large_param_0]; ; CHECK-NEXT: add.s64 %rd2, %rd1, 17179869172; ; CHECK-NEXT: ld.u32 %r1, [%rd2]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i64 4294967293 %res = load i32, ptr %addr diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index 5983d71e065d..4bda8049b267 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -10,7 +10,7 @@ define void @test_v2f32(<2 x float> %input, ptr %output) { ; CHECK-LABEL: @test_v2f32 %call = tail call <2 x float> @barv(<2 x float> %input) ; CHECK: 
.param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0]; store <2 x float> %call, ptr %output, align 8 ; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void @@ -21,7 +21,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) { ; %call = tail call <3 x float> @barv3(<3 x float> %input) ; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8]; ; Make sure we don't load more values than than we need to. ; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12]; @@ -38,7 +38,7 @@ define void @test_a2f32([2 x float] %input, ptr %output) { ; CHECK-LABEL: @test_a2f32 %call = tail call [2 x float] @bara([2 x float] %input) ; CHECK: .param .align 4 .b8 retval0[8]; -; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.f32 [[ELEMA2:%f[0-9]+]], [retval0+4]; store [2 x float] %call, ptr %output, align 4 ; CHECK: } @@ -52,7 +52,7 @@ define void @test_s2f32({float, float} %input, ptr %output) { ; CHECK-LABEL: @test_s2f32 %call = tail call {float, float} @bars({float, float} %input) ; CHECK: .param .align 4 .b8 retval0[8]; -; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.f32 [[ELEMS2:%f[0-9]+]], [retval0+4]; store {float, float} %call, ptr %output, align 4 ; CHECK: } diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 95bca39c73ad..80815b3ca37c 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -37,7 +37,7 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM70-NEXT: or.b32 %r9, %r5, 4194304; ; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM70-NEXT: st.param.b16 [func_retval0], %rs1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fadd( @@ -52,7 +52,7 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-NEXT: cvt.f32.bf16 %f2, %rs1; ; SM80-NEXT: add.rn.f32 %f3, %f2, %f1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fadd( @@ -67,7 +67,7 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f3; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fadd( @@ -78,7 +78,7 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM90-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; ; SM90-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; ; SM90-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM90-NEXT: st.param.b16 [func_retval0], %rs3; ; SM90-NEXT: ret; %3 = fadd bfloat %0, %1 ret bfloat %3 @@ -108,7 +108,7 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM70-NEXT: or.b32 %r9, %r5, 4194304; ; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, 
%r10; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM70-NEXT: st.param.b16 [func_retval0], %rs1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fsub( @@ -123,7 +123,7 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-NEXT: cvt.f32.bf16 %f2, %rs1; ; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fsub( @@ -138,7 +138,7 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f3; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fsub( @@ -149,7 +149,7 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM90-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; ; SM90-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; ; SM90-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM90-NEXT: st.param.b16 [func_retval0], %rs3; ; SM90-NEXT: ret; %3 = fsub bfloat %0, %1 ret bfloat %3 @@ -199,7 +199,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; } ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7}; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r23; +; SM70-NEXT: st.param.b32 [func_retval0], %r23; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_faddx2( @@ -222,7 +222,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-NEXT: add.rn.f32 %f6, %f5, %f4; ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6; ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM80-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_faddx2( @@ -245,7 +245,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6; ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM80-FTZ-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_faddx2( @@ -256,7 +256,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM90-NEXT: ld.param.b32 %r1, [test_faddx2_param_1]; ; SM90-NEXT: ld.param.b32 %r2, [test_faddx2_param_0]; ; SM90-NEXT: add.rn.bf16x2 %r3, %r2, %r1; -; SM90-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; ; SM90-NEXT: ret; %r = fadd <2 x bfloat> %a, %b ret <2 x bfloat> %r @@ -306,7 +306,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; } ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7}; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r23; +; SM70-NEXT: st.param.b32 [func_retval0], %r23; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fsubx2( @@ -329,7 +329,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4; ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6; ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM80-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fsubx2( @@ -352,7 +352,7 @@ define <2 x bfloat> 
@test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6; ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM80-FTZ-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fsubx2( @@ -363,7 +363,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM90-NEXT: ld.param.b32 %r1, [test_fsubx2_param_1]; ; SM90-NEXT: ld.param.b32 %r2, [test_fsubx2_param_0]; ; SM90-NEXT: sub.rn.bf16x2 %r3, %r2, %r1; -; SM90-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; ; SM90-NEXT: ret; %r = fsub <2 x bfloat> %a, %b ret <2 x bfloat> %r @@ -413,7 +413,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; } ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7}; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r23; +; SM70-NEXT: st.param.b32 [func_retval0], %r23; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fmulx2( @@ -436,7 +436,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4; ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6; ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM80-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fmulx2( @@ -459,7 +459,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6; ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM80-FTZ-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fmulx2( @@ -470,7 +470,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM90-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1]; ; SM90-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0]; ; SM90-NEXT: mul.rn.bf16x2 %r3, %r2, %r1; -; SM90-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; ; SM90-NEXT: ret; %r = fmul <2 x bfloat> %a, %b ret <2 x bfloat> %r @@ -520,7 +520,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; } ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7}; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r23; +; SM70-NEXT: st.param.b32 [func_retval0], %r23; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fdiv( @@ -543,7 +543,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-NEXT: div.rn.f32 %f6, %f5, %f4; ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6; ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM80-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fdiv( @@ -566,7 +566,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6; ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM80-FTZ-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fdiv( @@ -589,7 +589,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM90-NEXT: div.rn.f32 %f6, %f5, %f4; ; SM90-NEXT: 
cvt.rn.bf16.f32 %rs6, %f6; ; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; SM90-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; ; SM90-NEXT: ret; %r = fdiv <2 x bfloat> %a, %b ret <2 x bfloat> %r @@ -602,7 +602,7 @@ define bfloat @test_extract_0(<2 x bfloat> %a) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [test_extract_0_param_0]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %e = extractelement <2 x bfloat> %a, i32 0 ret bfloat %e @@ -615,7 +615,7 @@ define bfloat @test_extract_1(<2 x bfloat> %a) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [test_extract_1_param_0+2]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %e = extractelement <2 x bfloat> %a, i32 1 ret bfloat %e @@ -631,7 +631,7 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM70-NEXT: ld.param.u16 %r1, [test_fpext_float_param_0]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; -; SM70-NEXT: st.param.f32 [func_retval0+0], %f1; +; SM70-NEXT: st.param.f32 [func_retval0], %f1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fpext_float( @@ -642,7 +642,7 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: st.param.f32 [func_retval0+0], %f1; +; SM80-NEXT: st.param.f32 [func_retval0], %f1; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fpext_float( @@ -653,7 +653,7 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; -; SM80-FTZ-NEXT: st.param.f32 [func_retval0+0], %f1; +; SM80-FTZ-NEXT: st.param.f32 [func_retval0], %f1; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fpext_float( @@ -664,7 +664,7 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; ; SM90-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM90-NEXT: st.param.f32 [func_retval0+0], %f1; +; SM90-NEXT: st.param.f32 [func_retval0], %f1; ; SM90-NEXT: ret; %r = fpext bfloat %a to float ret float %r @@ -688,7 +688,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM70-NEXT: or.b32 %r5, %r1, 4194304; ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM70-NEXT: st.param.b16 [func_retval0], %rs1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fptrunc_float( @@ -699,7 +699,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM80-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fptrunc_float( @@ -710,7 +710,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fptrunc_float( @@ -721,7 +721,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.f32 %f1, 
[test_fptrunc_float_param_0]; ; SM90-NEXT: cvt.rn.bf16.f32 %rs1, %f1; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM90-NEXT: st.param.b16 [func_retval0], %rs1; ; SM90-NEXT: ret; %r = fptrunc float %a to bfloat ret bfloat %r @@ -748,7 +748,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM70-NEXT: or.b32 %r7, %r3, 4194304; ; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM70-NEXT: st.param.b16 [func_retval0], %rs1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fadd_imm_1( @@ -761,7 +761,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; ; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fadd_imm_1( @@ -774,7 +774,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f3F800000; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f2; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fadd_imm_1( @@ -785,7 +785,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM90-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; ; SM90-NEXT: mov.b16 %rs2, 0x3F80; ; SM90-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM90-NEXT: st.param.b16 [func_retval0], %rs3; ; SM90-NEXT: ret; %r = fadd bfloat %a, 1.0 ret bfloat %r @@ -805,7 +805,7 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat % ; CHECK-NEXT: ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2]; ; CHECK-NEXT: ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3]; ; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; -; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs3; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-NEXT: ret; %cc = fcmp olt double %a, %b %r = select i1 %cc, bfloat %c, bfloat %d @@ -851,7 +851,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70-NEXT: cvt.u32.u16 %r19, %rs1; ; SM70-NEXT: shl.b32 %r20, %r19, 16; ; SM70-NEXT: mov.b32 %f8, %r20; -; SM70-NEXT: st.param.v4.f32 [func_retval0+0], {%f8, %f7, %f6, %f5}; +; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; ; SM70-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; ; SM70-NEXT: ret; ; @@ -877,7 +877,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-NEXT: cvt.f32.bf16 %f6, %rs3; ; SM80-NEXT: cvt.f32.bf16 %f7, %rs2; ; SM80-NEXT: cvt.f32.bf16 %f8, %rs1; -; SM80-NEXT: st.param.v4.f32 [func_retval0+0], {%f8, %f7, %f6, %f5}; +; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; ; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; ; SM80-NEXT: ret; ; @@ -903,7 +903,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3; ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2; ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1; -; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+0], {%f8, %f7, %f6, %f5}; +; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; ; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; ; SM80-FTZ-NEXT: ret; ; @@ -929,7 +929,7 @@ define <8 x float> @test_extload_bf16x8(ptr 
addrspace(3) noundef %arg) #0 { ; SM90-NEXT: cvt.f32.bf16 %f6, %rs3; ; SM90-NEXT: cvt.f32.bf16 %f7, %rs2; ; SM90-NEXT: cvt.f32.bf16 %f8, %rs1; -; SM90-NEXT: st.param.v4.f32 [func_retval0+0], {%f8, %f7, %f6, %f5}; +; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; ; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; ; SM90-NEXT: ret; %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16 @@ -950,7 +950,7 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM70-NEXT: mov.b32 %f1, %r2; ; SM70-NEXT: cvt.rzi.s16.f32 %rs1, %f1; ; SM70-NEXT: cvt.u32.u16 %r3, %rs1; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM70-NEXT: st.param.b32 [func_retval0], %r3; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fptosi_i16( @@ -964,7 +964,7 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; ; SM80-NEXT: cvt.rzi.s16.f32 %rs2, %f1; ; SM80-NEXT: cvt.u32.u16 %r1, %rs2; -; SM80-NEXT: st.param.b32 [func_retval0+0], %r1; +; SM80-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fptosi_i16( @@ -978,7 +978,7 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %f1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; -; SM80-FTZ-NEXT: st.param.b32 [func_retval0+0], %r1; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fptosi_i16( @@ -990,7 +990,7 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM90-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; ; SM90-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1; ; SM90-NEXT: cvt.u32.u16 %r1, %rs2; -; SM90-NEXT: st.param.b32 [func_retval0+0], %r1; +; SM90-NEXT: st.param.b32 [func_retval0], %r1; ; SM90-NEXT: ret; %r = fptosi bfloat %a to i16 ret i16 %r @@ -1009,7 +1009,7 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM70-NEXT: mov.b32 %f1, %r2; ; SM70-NEXT: cvt.rzi.u16.f32 %rs1, %f1; ; SM70-NEXT: cvt.u32.u16 %r3, %rs1; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM70-NEXT: st.param.b32 [func_retval0], %r3; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fptoui_i16( @@ -1023,7 +1023,7 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; ; SM80-NEXT: cvt.rzi.u16.f32 %rs2, %f1; ; SM80-NEXT: cvt.u32.u16 %r1, %rs2; -; SM80-NEXT: st.param.b32 [func_retval0+0], %r1; +; SM80-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fptoui_i16( @@ -1037,7 +1037,7 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %f1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; -; SM80-FTZ-NEXT: st.param.b32 [func_retval0+0], %r1; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fptoui_i16( @@ -1049,7 +1049,7 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM90-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; ; SM90-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1; ; SM90-NEXT: cvt.u32.u16 %r1, %rs2; -; SM90-NEXT: st.param.b32 [func_retval0+0], %r1; +; SM90-NEXT: st.param.b32 [func_retval0], %r1; ; SM90-NEXT: ret; %r = fptoui bfloat %a to i16 ret i16 %r @@ -1074,7 +1074,7 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM70-NEXT: or.b32 %r5, %r1, 4194304; ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_sitofp_i16( @@ -1086,7 +1086,7 @@ define bfloat @test_sitofp_i16(i16 
%a) { ; SM80-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0]; ; SM80-NEXT: cvt.rn.f32.s16 %f1, %rs1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_sitofp_i16( @@ -1098,7 +1098,7 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.s16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_sitofp_i16( @@ -1108,7 +1108,7 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0]; ; SM90-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM90-NEXT: st.param.b16 [func_retval0], %rs2; ; SM90-NEXT: ret; %r = sitofp i16 %a to bfloat ret bfloat %r @@ -1133,7 +1133,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM70-NEXT: or.b32 %r5, %r1, 4194304; ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_uitofp_i8( @@ -1145,7 +1145,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0]; ; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_uitofp_i8( @@ -1157,7 +1157,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_uitofp_i8( @@ -1167,7 +1167,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0]; ; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM90-NEXT: st.param.b16 [func_retval0], %rs2; ; SM90-NEXT: ret; %r = uitofp i8 %a to bfloat ret bfloat %r @@ -1195,7 +1195,7 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM70-NEXT: or.b32 %r6, %r2, 4194304; ; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p2; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r7; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM70-NEXT: st.param.b16 [func_retval0], %rs3; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_uitofp_i1( @@ -1212,7 +1212,7 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-NEXT: selp.u32 %r1, 1, 0, %p1; ; SM80-NEXT: cvt.rn.f32.u32 %f1, %r1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f1; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_uitofp_i1( @@ -1229,7 +1229,7 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: selp.u32 %r1, 1, 0, %p1; ; SM80-FTZ-NEXT: cvt.rn.f32.u32 %f1, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f1; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_uitofp_i1( @@ -1244,7 +1244,7 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM90-NEXT: setp.eq.b16 
%p1, %rs2, 1; ; SM90-NEXT: selp.u32 %r1, 1, 0, %p1; ; SM90-NEXT: cvt.rn.bf16.u32 %rs3, %r1; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM90-NEXT: st.param.b16 [func_retval0], %rs3; ; SM90-NEXT: ret; %r = uitofp i1 %a to bfloat ret bfloat %r @@ -1269,7 +1269,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM70-NEXT: or.b32 %r5, %r1, 4194304; ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM70-NEXT: st.param.b16 [func_retval0], %rs2; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_uitofp_i16( @@ -1281,7 +1281,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0]; ; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_uitofp_i16( @@ -1293,7 +1293,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_uitofp_i16( @@ -1303,7 +1303,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0]; ; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM90-NEXT: st.param.b16 [func_retval0], %rs2; ; SM90-NEXT: ret; %r = uitofp i16 %a to bfloat ret bfloat %r @@ -1328,7 +1328,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM70-NEXT: or.b32 %r6, %r2, 4194304; ; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM70-NEXT: st.param.b16 [func_retval0], %rs1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_uitofp_i32( @@ -1341,7 +1341,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0]; ; SM80-NEXT: cvt.rn.f32.u32 %f1, %r1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM80-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_uitofp_i32( @@ -1354,7 +1354,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.u32 %f1, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_uitofp_i32( @@ -1365,7 +1365,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0]; ; SM90-NEXT: cvt.rn.bf16.u32 %rs1, %r1; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM90-NEXT: st.param.b16 [func_retval0], %rs1; ; SM90-NEXT: ret; %r = uitofp i32 %a to bfloat ret bfloat %r @@ -1391,7 +1391,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM70-NEXT: or.b32 %r5, %r1, 4194304; ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM70-NEXT: st.param.b16 [func_retval0], %rs1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_uitofp_i64( @@ -1404,7 +1404,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-NEXT: ld.param.u64 
%rd1, [test_uitofp_i64_param_0]; ; SM80-NEXT: cvt.rn.f32.u64 %f1, %rd1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM80-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_uitofp_i64( @@ -1417,7 +1417,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.u64 %f1, %rd1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_uitofp_i64( @@ -1428,7 +1428,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0]; ; SM90-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM90-NEXT: st.param.b16 [func_retval0], %rs1; ; SM90-NEXT: ret; %r = uitofp i64 %a to bfloat ret bfloat %r @@ -1455,7 +1455,7 @@ define bfloat @test_roundeven(bfloat %a) { ; SM70-NEXT: or.b32 %r7, %r3, 4194304; ; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM70-NEXT: st.param.b16 [func_retval0], %rs1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_roundeven( @@ -1468,7 +1468,7 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; ; SM80-NEXT: cvt.rni.f32.f32 %f2, %f1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_roundeven( @@ -1481,7 +1481,7 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f2; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_roundeven( @@ -1491,7 +1491,7 @@ define bfloat @test_roundeven(bfloat %a) { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; ; SM90-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs2; +; SM90-NEXT: st.param.b16 [func_retval0], %rs2; ; SM90-NEXT: ret; %r = call bfloat @llvm.roundeven.bf16(bfloat %a) ret bfloat %r @@ -1527,7 +1527,7 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM70-NEXT: mov.b32 %f3, %r6; ; SM70-NEXT: setp.eq.f32 %p5, %f3, 0f00000000; ; SM70-NEXT: selp.b16 %rs10, %rs8, %rs6, %p5; -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs10; +; SM70-NEXT: st.param.b16 [func_retval0], %rs10; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_maximum( @@ -1538,7 +1538,7 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; ; SM80-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; ; SM80-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_maximum( @@ -1549,7 +1549,7 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; ; SM80-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_maximum( @@ -1560,7 
+1560,7 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM90-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; ; SM90-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; ; SM90-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM90-NEXT: st.param.b16 [func_retval0], %rs3; ; SM90-NEXT: ret; %r = call bfloat @llvm.maximum.bf16(bfloat %a, bfloat %b) ret bfloat %r @@ -1590,7 +1590,7 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM70-NEXT: or.b32 %r9, %r5, 4194304; ; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; } -; SM70-NEXT: st.param.b16 [func_retval0+0], %rs1; +; SM70-NEXT: st.param.b16 [func_retval0], %rs1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_maxnum( @@ -1601,7 +1601,7 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; ; SM80-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; ; SM80-NEXT: max.bf16 %rs3, %rs1, %rs2; -; SM80-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_maxnum( @@ -1612,7 +1612,7 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; ; SM80-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2; -; SM80-FTZ-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_maxnum( @@ -1623,7 +1623,7 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM90-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; ; SM90-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; ; SM90-NEXT: max.bf16 %rs3, %rs1, %rs2; -; SM90-NEXT: st.param.b16 [func_retval0+0], %rs3; +; SM90-NEXT: st.param.b16 [func_retval0], %rs3; ; SM90-NEXT: ret; %r = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) ret bfloat %r @@ -1681,7 +1681,7 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM70-NEXT: setp.eq.f32 %p10, %f6, 0f00000000; ; SM70-NEXT: selp.b16 %rs20, %rs18, %rs16, %p10; ; SM70-NEXT: mov.b32 %r15, {%rs20, %rs12}; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r15; +; SM70-NEXT: st.param.b32 [func_retval0], %r15; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_maximum_v2( @@ -1692,7 +1692,7 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1]; ; SM80-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0]; ; SM80-NEXT: max.NaN.bf16x2 %r3, %r2, %r1; -; SM80-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_maximum_v2( @@ -1703,7 +1703,7 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1]; ; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0]; ; SM80-FTZ-NEXT: max.NaN.bf16x2 %r3, %r2, %r1; -; SM80-FTZ-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_maximum_v2( @@ -1714,7 +1714,7 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM90-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1]; ; SM90-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0]; ; SM90-NEXT: max.NaN.bf16x2 %r3, %r2, %r1; -; SM90-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; ; SM90-NEXT: ret; %r = call <2 x 
bfloat> @llvm.maximum.bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %r @@ -1764,7 +1764,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2; ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; } ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7}; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r23; +; SM70-NEXT: st.param.b32 [func_retval0], %r23; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_maxnum_v2( @@ -1775,7 +1775,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1]; ; SM80-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0]; ; SM80-NEXT: max.bf16x2 %r3, %r2, %r1; -; SM80-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_maxnum_v2( @@ -1786,7 +1786,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1]; ; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0]; ; SM80-FTZ-NEXT: max.bf16x2 %r3, %r2, %r1; -; SM80-FTZ-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_maxnum_v2( @@ -1797,7 +1797,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM90-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1]; ; SM90-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0]; ; SM90-NEXT: max.bf16x2 %r3, %r2, %r1; -; SM90-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; ; SM90-NEXT: ret; %r = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %r diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll index f61205eb88fc..a53c90ac6db8 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll @@ -16,7 +16,7 @@ declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0 ; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; ; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 { %r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) @@ -33,7 +33,7 @@ define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 { ; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; ; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 { %r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index 8d40a9ef54dc..925ae4245a4c 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -7,7 +7,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" ; CHECK-LABEL: test_ret_const( ; CHECK: mov.b32 [[T:%r[0-9+]]], 1073758080; -; CHECK: st.param.b32 [func_retval0+0], [[T]]; +; CHECK: st.param.b32 [func_retval0], [[T]]; ; CHECK-NEXT: ret; define <2 x bfloat> @test_ret_const() #0 { @@ -30,7 +30,7 @@ define <2 x bfloat> @test_ret_const() #0 
{ ; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]] ; SM80-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 { @@ -47,7 +47,7 @@ define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 { ; SM80: add.rn.f32 [[FR:%f[0-9]+]], [[FA]], 0f3F800000; ; SM80: cvt.rn.bf16.f32 [[R:%rs[0-9]+]], [[FR]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define bfloat @test_fadd_imm_1(bfloat %a) #0 { @@ -72,7 +72,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]; ; SM80: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { @@ -97,7 +97,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]; ; SM80: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { @@ -119,7 +119,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]]; ; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]; ; CHECK-NEXT: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { @@ -131,7 +131,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_fneg_param_0]; ; CHECK-DAG: xor.b32 [[IHH0:%r[0-9]+]], [[A]], -2147450880; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[IHH0]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[IHH0]]; ; CHECK-NEXT: ret; define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 { %r = fneg <2 x bfloat> %a @@ -175,15 +175,15 @@ declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0 ; CHECK: { ; CHECK-DAG: .param .align 4 .b8 param0[4]; ; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[A]]; -; CHECK-DAG: st.param.b32 [param1+0], [[B]]; +; CHECK-DAG: st.param.b32 [param0], [[A]]; +; CHECK-DAG: st.param.b32 [param1], [[B]]; ; CHECK-DAG: .param .align 4 .b8 retval0[4]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_callee, ; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 { @@ -197,7 +197,7 @@ define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] ; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; ; CHECK-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c) #0 { @@ -227,7 +227,7 @@ define <2 
x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c ; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; ; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c, <2 x bfloat> %d) #0 { @@ -255,7 +255,7 @@ define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloa ; ; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; ; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]}; ; CHECK-NEXT: ret; define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b, <2 x bfloat> %c, <2 x bfloat> %d) #0 { @@ -276,7 +276,7 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b, ; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; ; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b, <2 x float> %c, <2 x float> %d) #0 { @@ -290,7 +290,7 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[A1]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 { %r = fptrunc <2 x float> %a to <2 x bfloat> @@ -302,7 +302,7 @@ define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] ; CHECK-DAG: cvt.f32.bf16 [[R0:%f[0-9]+]], [[A0]]; ; CHECK-DAG: cvt.f32.bf16 [[R1:%f[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]}; ; CHECK: ret; define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 { %r = fpext <2 x bfloat> %a to <2 x float> @@ -311,7 +311,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 { ; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16( ; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xbf16_to_2xi16_param_0]; -; CHECK: st.param.b32 [func_retval0+0], [[A]] +; CHECK: st.param.b32 [func_retval0], [[A]] ; CHECK: ret; define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 { %r = bitcast <2 x bfloat> %a to <2 x i16> @@ -321,7 +321,7 @@ define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 { ; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16( ; CHECK: ld.param.b32 [[R]], [test_bitcast_2xi16_to_2xbf16_param_0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_bitcast_2xi16_to_2xbf16(<2 x i16> %a) #0 { %r = bitcast <2 x i16> %a to <2 x bfloat> @@ -362,7 +362,7 @@ declare <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bf ; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; ; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], 
{[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 { %r = call <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a) @@ -375,7 +375,7 @@ define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 { ; CHECK-DAG: ld.param.b32 [[C:%r[0-9]+]], [test_fmuladd_param_2]; ; ; CHECK: fma.rn.bf16x2 [[RA:%r[0-9]+]], [[A]], [[B]], [[C]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[RA]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[RA]]; ; CHECK: ret; define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { %r = call <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) @@ -385,7 +385,7 @@ define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; CHECK-LABEL: test_fabs( ; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_fabs_param_0]; ; CHECK: and.b32 [[R:%r[0-9]+]], [[A]], 2147450879; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 { %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a) @@ -407,7 +407,7 @@ define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-DAG: ld.param.b32 [[AF0:%r[0-9]+]], [test_minnum_param_0]; ; CHECK-DAG: ld.param.b32 [[BF0:%r[0-9]+]], [test_minnum_param_1]; ; CHECK-DAG: min.bf16x2 [[RF0:%r[0-9]+]], [[AF0]], [[BF0]]; -; CHECK: st.param.b32 [func_retval0+0], [[RF0]]; +; CHECK: st.param.b32 [func_retval0], [[RF0]]; ; CHECK: ret; define <2 x bfloat> @test_minnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 { %r = call <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) @@ -418,7 +418,7 @@ define <2 x bfloat> @test_minnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-DAG: ld.param.b32 [[AF0:%r[0-9]+]], [test_maxnum_param_0]; ; CHECK-DAG: ld.param.b32 [[BF0:%r[0-9]+]], [test_maxnum_param_1]; ; CHECK-DAG: max.bf16x2 [[RF0:%r[0-9]+]], [[AF0]], [[BF0]]; -; CHECK: st.param.b32 [func_retval0+0], [[RF0]]; +; CHECK: st.param.b32 [func_retval0], [[RF0]]; ; CHECK: ret; define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 { %r = call <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) @@ -439,7 +439,7 @@ define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; ; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 { %r = call <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a) @@ -458,7 +458,7 @@ define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 { ; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; ; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 { %r = call <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a) @@ -471,7 +471,7 @@ define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 { ; SM90: cvt.rzi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]]; ; SM90: cvt.rzi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x 
bfloat> @test_trunc(<2 x bfloat> %a) #0 { %r = call <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a) @@ -484,7 +484,7 @@ define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 { ; SM90: cvt.rni.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]]; ; SM90: cvt.rni.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 { %r = call <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a) @@ -498,7 +498,7 @@ define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 { ; CHECK: or.b32 {{.*}}, [[R1]], 1056964608; ; CHECK: and.b32 [[R2:%r[0-9]+]], {{.*}}, -2147483648; ; CHECK: or.b32 {{.*}}, [[R2]], 1056964608; -; CHECK: st.param.b32 [func_retval0+0], {{.*}}; +; CHECK: st.param.b32 [func_retval0], {{.*}}; ; CHECK: ret; define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 { %r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a) @@ -526,7 +526,7 @@ define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 { ; SM90-DAG: and.b32 [[R1:%r[0-9]+]], [[B]], -2147450880; ; SM90-DAG: and.b32 [[R2:%r[0-9]+]], [[A]], 2147450879; ; SM90-DAG: or.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index 3f929ec6a75d..461cecf57270 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -16,7 +16,7 @@ define i16 @bswap16(i16 %a) { ; CHECK-NEXT: shl.b16 %rs3, %rs1, 8; ; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %b = tail call i16 @llvm.bswap.i16(i16 %a) ret i16 %b @@ -31,7 +31,7 @@ define i32 @bswap32(i32 %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [bswap32_param_0]; ; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %b = tail call i32 @llvm.bswap.i32(i32 %a) ret i32 %b @@ -46,7 +46,7 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [bswapv2i16_param_0]; ; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 8961; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a) ret <2 x i16> %b @@ -65,7 +65,7 @@ define i64 @bswap64(i64 %a) { ; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; } ; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 291; ; CHECK-NEXT: mov.b64 %rd2, {%r4, %r2}; -; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %b = tail call i64 @llvm.bswap.i64(i64 %a) ret i64 %b diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll index 3fbed871850b..0ce9a58b2e6e 100644 --- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -45,9 +45,9 @@ entry: store float %3, ptr %arrayidx7, align 4 ; CHECK: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0+0], %rd[[A_REG]] +; CHECK-NEXT: st.param.b64 [param0], %rd[[A_REG]] 
; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1+0], %rd[[SP_REG]] +; CHECK-NEXT: st.param.b64 [param1], %rd[[SP_REG]] ; CHECK-NEXT: call.uni ; CHECK-NEXT: callee, diff --git a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll index bd723a296e62..5cf70a6aea5c 100644 --- a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll +++ b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll @@ -14,7 +14,7 @@ target triple = "nvptx64-nvidia-cuda" %complex_half = type { half, half } ; CHECK: .param .align 2 .b8 param2[4]; -; CHECK: st.param.b16 [param2+0], %rs1; +; CHECK: st.param.b16 [param2], %rs1; ; CHECK: st.param.b16 [param2+2], %rs2; ; CHECK: .param .align 2 .b8 retval0[4]; ; CHECK-NEXT: prototype_0 : .callprototype (.param .align 2 .b8 _[4]) _ (.param .b32 _, .param .b32 _, .param .align 2 .b8 _[4]); @@ -37,7 +37,7 @@ define internal void @callee(ptr byval(%"class.complex") %byval_arg) { define void @boom() { %fp = call ptr @usefp(ptr @callee) ; CHECK: .param .align 2 .b8 param0[4]; - ; CHECK: st.param.b16 [param0+0], %rs1; + ; CHECK: st.param.b16 [param0], %rs1; ; CHECK: st.param.b16 [param0+2], %rs2; ; CHECK: .callprototype ()_ (.param .align 2 .b8 _[4]); call void %fp(ptr byval(%"class.complex") null) diff --git a/llvm/test/CodeGen/NVPTX/chain-different-as.ll b/llvm/test/CodeGen/NVPTX/chain-different-as.ll index 18d06647cfe0..293281e17dd3 100644 --- a/llvm/test/CodeGen/NVPTX/chain-different-as.ll +++ b/llvm/test/CodeGen/NVPTX/chain-different-as.ll @@ -11,7 +11,7 @@ define i64 @test() nounwind readnone { ; CHECK-NEXT: mov.u64 %rd2, 42; ; CHECK-NEXT: st.u64 [%rd1], %rd2; ; CHECK-NEXT: ld.global.u64 %rd3, [%rd1]; -; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; %addr0 = inttoptr i64 1 to ptr %addr1 = inttoptr i64 1 to ptr addrspace(1) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 85ae5f0c8f60..f7cc32b962b9 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -47,7 +47,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: mov.u32 %r20, %r8; ; SM30-NEXT: @%p2 bra $L__BB0_1; ; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0+0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i8( @@ -87,7 +87,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: cvt.u32.u16 %r2, %rs9; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -132,7 +132,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: mov.u32 %r19, %r8; ; SM30-NEXT: @%p2 bra $L__BB1_1; ; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0+0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i16( @@ -147,7 +147,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: ld.param.u16 %rs2, [relaxed_sys_i16_param_2]; ; SM70-NEXT: atom.cas.b16 %rs3, [%rd1], %rs1, %rs2; ; SM70-NEXT: cvt.u32.u16 %r1, %rs2; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r1; +; SM70-NEXT: st.param.b32 [func_retval0], %r1; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
seq_cst seq_cst ret i16 %new @@ -165,7 +165,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; ; SM30-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM30-NEXT: st.param.b32 [func_retval0+0], %r2; +; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i32( @@ -178,7 +178,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; ; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; ; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0+0], %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst ret i32 %new @@ -195,7 +195,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; ; SM30-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM30-NEXT: st.param.b64 [func_retval0+0], %rd3; +; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i64( @@ -207,7 +207,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; ; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; ; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0+0], %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst ret i64 %new diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index 56bfaa14c587..1b22cfde3972 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -14,7 +14,7 @@ define i32 @test1(i32 %n, i32 %m) { ; CHECK-NEXT: ld.param.u32 %r1, [test1_param_0]; ; CHECK-NEXT: ld.param.u32 %r2, [test1_param_1]; ; CHECK-NEXT: mad.lo.s32 %r3, %r2, %r1, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %add = add i32 %n, 1 %mul = mul i32 %add, %m @@ -31,7 +31,7 @@ define i32 @test1_rev(i32 %n, i32 %m) { ; CHECK-NEXT: ld.param.u32 %r1, [test1_rev_param_0]; ; CHECK-NEXT: ld.param.u32 %r2, [test1_rev_param_1]; ; CHECK-NEXT: mad.lo.s32 %r3, %r2, %r1, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %add = add i32 %n, 1 %mul = mul i32 %m, %add @@ -53,7 +53,7 @@ define i32 @test2(i32 %n, i32 %m, i32 %s) { ; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1; ; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2; ; CHECK-NEXT: selp.b32 %r5, %r2, %r4, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %add = add i32 %n, 1 %cond = icmp slt i32 %s, 1 @@ -77,7 +77,7 @@ define i32 @test2_rev1(i32 %n, i32 %m, i32 %s) { ; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1; ; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2; ; CHECK-NEXT: selp.b32 %r5, %r4, %r2, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %add = add i32 %n, 1 %cond = icmp slt i32 %s, 1 @@ -101,7 +101,7 @@ define i32 @test2_rev2(i32 %n, i32 %m, i32 %s) { ; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1; ; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2; ; CHECK-NEXT: 
selp.b32 %r5, %r4, %r2, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %add = add i32 %n, 1 %cond = icmp slt i32 %s, 1 @@ -126,7 +126,7 @@ define i32 @test3(i32 %n, i32 %m, i32 %s) { ; CHECK-NEXT: setp.lt.s32 %p1, %r4, 1; ; CHECK-NEXT: selp.b32 %r5, 1, %r2, %p1; ; CHECK-NEXT: mul.lo.s32 %r6, %r5, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %add = add i32 %n, 3 %cond = icmp slt i32 %s, 1 @@ -152,7 +152,7 @@ define i32 @test4(i32 %a, i32 %b, i32 %c, i1 %p) { ; CHECK-NEXT: ld.param.u32 %r3, [test4_param_2]; ; CHECK-NEXT: mad.lo.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: selp.b32 %r5, %r4, %r3, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %mul = mul i32 %a, %b %sel = select i1 %p, i32 %mul, i32 0 @@ -176,7 +176,7 @@ define i32 @test4_rev(i32 %a, i32 %b, i32 %c, i1 %p) { ; CHECK-NEXT: ld.param.u32 %r3, [test4_rev_param_2]; ; CHECK-NEXT: mad.lo.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: selp.b32 %r5, %r3, %r4, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %mul = mul i32 %a, %b %sel = select i1 %p, i32 0, i32 %mul diff --git a/llvm/test/CodeGen/NVPTX/compute-ptx-value-vts.ll b/llvm/test/CodeGen/NVPTX/compute-ptx-value-vts.ll index a88c5637f089..5deafb3ceed7 100644 --- a/llvm/test/CodeGen/NVPTX/compute-ptx-value-vts.ll +++ b/llvm/test/CodeGen/NVPTX/compute-ptx-value-vts.ll @@ -10,7 +10,7 @@ define <6 x half> @half6() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b16 %rs1, 0x0000; -; CHECK-NEXT: st.param.v4.b16 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b16 [func_retval0+8], {%rs1, %rs1}; ; CHECK-NEXT: ret; ret <6 x half> zeroinitializer @@ -23,7 +23,7 @@ define <10 x half> @half10() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b16 %rs1, 0x0000; -; CHECK-NEXT: st.param.v4.b16 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b16 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b16 [func_retval0+16], {%rs1, %rs1}; ; CHECK-NEXT: ret; @@ -37,7 +37,7 @@ define <12 x i8> @byte12() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: ret; @@ -51,7 +51,7 @@ define <20 x i8> @byte20() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs1, %rs1, %rs1, %rs1}; diff --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll index d7e2cede8a99..b1850185f0c7 100644 --- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll 
@@ -11,7 +11,7 @@ define i16 @cvt_i16_i32(i32 %x) { ; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}] -; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]] +; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]] ; CHECK: ret %a = trunc i32 %x to i16 ret i16 %a @@ -19,7 +19,7 @@ define i16 @cvt_i16_i32(i32 %x) { define i16 @cvt_i16_i64(i64 %x) { ; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}] -; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]] +; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]] ; CHECK: ret %a = trunc i64 %x to i16 ret i16 %a @@ -31,7 +31,7 @@ define i16 @cvt_i16_i64(i64 %x) { define i32 @cvt_i32_i16(i16 %x) { ; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}] -; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]] +; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]] ; CHECK: ret %a = zext i16 %x to i32 ret i32 %a @@ -39,7 +39,7 @@ define i32 @cvt_i32_i16(i16 %x) { define i32 @cvt_i32_i64(i64 %x) { ; CHECK: ld.param.u32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}] -; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]] +; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]] ; CHECK: ret %a = trunc i64 %x to i32 ret i32 %a @@ -51,7 +51,7 @@ define i32 @cvt_i32_i64(i64 %x) { define i64 @cvt_i64_i16(i16 %x) { ; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}] -; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]] +; CHECK: st.param.b64 [func_retval{{[0-9]+}}], %rd[[R0]] ; CHECK: ret %a = zext i16 %x to i64 ret i64 %a @@ -59,7 +59,7 @@ define i64 @cvt_i64_i16(i16 %x) { define i64 @cvt_i64_i32(i32 %x) { ; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}] -; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]] +; CHECK: st.param.b64 [func_retval{{[0-9]+}}], %rd[[R0]] ; CHECK: ret %a = zext i32 %x to i64 ret i64 %a diff --git a/llvm/test/CodeGen/NVPTX/copysign.ll b/llvm/test/CodeGen/NVPTX/copysign.ll index a6aad1c2f012..ba7db68b3977 100644 --- a/llvm/test/CodeGen/NVPTX/copysign.ll +++ b/llvm/test/CodeGen/NVPTX/copysign.ll @@ -14,7 +14,7 @@ define float @fcopysign_f_f(float %a, float %b) { ; CHECK-NEXT: ld.param.f32 %f1, [fcopysign_f_f_param_0]; ; CHECK-NEXT: ld.param.f32 %f2, [fcopysign_f_f_param_1]; ; CHECK-NEXT: copysign.f32 %f3, %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3; +; CHECK-NEXT: st.param.f32 [func_retval0], %f3; ; CHECK-NEXT: ret; %val = call float @llvm.copysign.f32(float %a, float %b) ret float %val @@ -29,7 +29,7 @@ define double @fcopysign_d_d(double %a, double %b) { ; CHECK-NEXT: ld.param.f64 %fd1, [fcopysign_d_d_param_0]; ; CHECK-NEXT: ld.param.f64 %fd2, [fcopysign_d_d_param_1]; ; CHECK-NEXT: copysign.f64 %fd3, %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3; +; CHECK-NEXT: st.param.f64 [func_retval0], %fd3; ; CHECK-NEXT: ret; %val = call double @llvm.copysign.f64(double %a, double %b) ret double %val @@ -51,7 +51,7 @@ define float @fcopysign_f_d(float %a, double %b) { ; CHECK-NEXT: and.b64 %rd3, %rd2, 1; ; CHECK-NEXT: setp.eq.b64 %p1, %rd3, 1; ; CHECK-NEXT: selp.f32 %f4, %f3, %f2, %p1; -; CHECK-NEXT: st.param.f32 [func_retval0+0], %f4; +; CHECK-NEXT: st.param.f32 [func_retval0], %f4; ; CHECK-NEXT: ret; %c = fptrunc double %b to float %val = call float @llvm.copysign.f32(float %a, float %c) @@ -74,7 +74,7 @@ define float @fcopysign_f_h(float %a, half %b) { ; CHECK-NEXT: and.b16 %rs3, %rs2, 1; ; CHECK-NEXT: setp.eq.b16 %p1, %rs3, 1; ; CHECK-NEXT: selp.f32 %f4, %f3, %f2, %p1; -; CHECK-NEXT: 
st.param.f32 [func_retval0+0], %f4; +; CHECK-NEXT: st.param.f32 [func_retval0], %f4; ; CHECK-NEXT: ret; %c = fpext half %b to float %val = call float @llvm.copysign.f32(float %a, float %c) @@ -97,7 +97,7 @@ define double @fcopysign_d_f(double %a, float %b) { ; CHECK-NEXT: and.b32 %r3, %r2, 1; ; CHECK-NEXT: setp.eq.b32 %p1, %r3, 1; ; CHECK-NEXT: selp.f64 %fd4, %fd3, %fd2, %p1; -; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd4; +; CHECK-NEXT: st.param.f64 [func_retval0], %fd4; ; CHECK-NEXT: ret; %c = fpext float %b to double %val = call double @llvm.copysign.f64(double %a, double %c) @@ -120,7 +120,7 @@ define double @fcopysign_d_h(double %a, half %b) { ; CHECK-NEXT: and.b16 %rs3, %rs2, 1; ; CHECK-NEXT: setp.eq.b16 %p1, %rs3, 1; ; CHECK-NEXT: selp.f64 %fd4, %fd3, %fd2, %p1; -; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd4; +; CHECK-NEXT: st.param.f64 [func_retval0], %fd4; ; CHECK-NEXT: ret; %c = fpext half %b to double %val = call double @llvm.copysign.f64(double %a, double %c) diff --git a/llvm/test/CodeGen/NVPTX/dot-product.ll b/llvm/test/CodeGen/NVPTX/dot-product.ll index 36529bbef903..8d3d7238d36f 100644 --- a/llvm/test/CodeGen/NVPTX/dot-product.ll +++ b/llvm/test/CodeGen/NVPTX/dot-product.ll @@ -19,7 +19,7 @@ define i32 @test_dp4a_u32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_u32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_u32_param_2]; ; CHECK-NEXT: dp4a.u32.u32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp4a.u.u(i32 %a, i32 %b, i32 %c) ret i32 %call @@ -34,7 +34,7 @@ define i32 @test_dp4a_u32imm_u32imm(i32 %c) { ; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32imm_u32imm_param_0]; ; CHECK-NEXT: mov.b32 %r2, 0; ; CHECK-NEXT: dp4a.u32.u32 %r3, %r2, %r2, %r1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp4a.u.u(i32 0, i32 0, i32 %c) ret i32 %call @@ -50,7 +50,7 @@ define i32 @test_dp4a_u32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_s32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_s32_param_2]; ; CHECK-NEXT: dp4a.u32.s32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp4a.u.s(i32 %a, i32 %b, i32 %c) ret i32 %call @@ -66,7 +66,7 @@ define i32 @test_dp4a_s32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_u32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_u32_param_2]; ; CHECK-NEXT: dp4a.s32.u32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp4a.s.u(i32 %a, i32 %b, i32 %c) ret i32 %call @@ -82,7 +82,7 @@ define i32 @test_dp4a_s32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_s32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_s32_param_2]; ; CHECK-NEXT: dp4a.s32.s32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp4a.s.s(i32 %a, i32 %b, i32 %c) ret i32 %call @@ -103,7 +103,7 @@ define i32 @test_dp2a_lo_u32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_u32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, 
[test_dp2a_lo_u32_u32_param_2]; ; CHECK-NEXT: dp2a.lo.u32.u32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp2a.u.u(i32 %a, i32 %b, i1 0, i32 %c) ret i32 %call @@ -119,7 +119,7 @@ define i32 @test_dp2a_lo_u32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_s32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_s32_param_2]; ; CHECK-NEXT: dp2a.lo.u32.s32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp2a.u.s(i32 %a, i32 %b, i1 0, i32 %c) ret i32 %call @@ -135,7 +135,7 @@ define i32 @test_dp2a_lo_s32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_u32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_u32_param_2]; ; CHECK-NEXT: dp2a.lo.s32.u32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp2a.s.u(i32 %a, i32 %b, i1 0, i32 %c) ret i32 %call @@ -151,7 +151,7 @@ define i32 @test_dp2a_lo_s32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_s32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_s32_param_2]; ; CHECK-NEXT: dp2a.lo.s32.s32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp2a.s.s(i32 %a, i32 %b, i1 0, i32 %c) ret i32 %call @@ -167,7 +167,7 @@ define i32 @test_dp2a_hi_u32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_u32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_u32_param_2]; ; CHECK-NEXT: dp2a.hi.u32.u32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp2a.u.u(i32 %a, i32 %b, i1 1, i32 %c) ret i32 %call @@ -183,7 +183,7 @@ define i32 @test_dp2a_hi_u32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_s32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_s32_param_2]; ; CHECK-NEXT: dp2a.hi.u32.s32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp2a.u.s(i32 %a, i32 %b, i1 1, i32 %c) ret i32 %call @@ -199,7 +199,7 @@ define i32 @test_dp2a_hi_s32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_u32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_u32_param_2]; ; CHECK-NEXT: dp2a.hi.s32.u32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp2a.s.u(i32 %a, i32 %b, i1 1, i32 %c) ret i32 %call @@ -215,7 +215,7 @@ define i32 @test_dp2a_hi_s32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_s32_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_s32_param_2]; ; CHECK-NEXT: dp2a.hi.s32.s32 %r4, %r1, %r2, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %call = call i32 @llvm.nvvm.idp2a.s.s(i32 %a, i32 %b, i1 1, i32 %c) ret i32 %call diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll 
b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index ce81957f2a39..44f39df02490 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -18,7 +18,7 @@ ; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]]; ; CHECK-32-NEXT: { // callseq 0, 0 ; CHECK-32-NEXT: .param .b32 param0; -; CHECK-32-NEXT: st.param.b32 [param0+0], %r[[ALLOCA]]; +; CHECK-32-NEXT: st.param.b32 [param0], %r[[ALLOCA]]; ; CHECK-64: ld.param.u64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; ; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7; @@ -27,7 +27,7 @@ ; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]]; ; CHECK-64-NEXT: { // callseq 0, 0 ; CHECK-64-NEXT: .param .b64 param0; -; CHECK-64-NEXT: st.param.b64 [param0+0], %rd[[ALLOCA]]; +; CHECK-64-NEXT: st.param.b64 [param0], %rd[[ALLOCA]]; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: call.uni (retval0), diff --git a/llvm/test/CodeGen/NVPTX/elect.ll b/llvm/test/CodeGen/NVPTX/elect.ll index 358dfef91852..71e1111562f2 100644 --- a/llvm/test/CodeGen/NVPTX/elect.ll +++ b/llvm/test/CodeGen/NVPTX/elect.ll @@ -16,7 +16,7 @@ define {i32, i1} @elect_sync(i32 %mask) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_param_0]; ; CHECK-NEXT: elect.sync %r2|%p1, %r1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1; ; CHECK-NEXT: ret; @@ -33,7 +33,7 @@ define {i32, i1} @elect_sync_imm() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: elect.sync %r1|%p1, -1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1; ; CHECK-NEXT: ret; @@ -54,7 +54,7 @@ define {i32, i1} @elect_sync_twice(i32 %mask) { ; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_twice_param_0]; ; CHECK-NEXT: elect.sync %r2|%p1, %r1; ; CHECK-NEXT: elect.sync %r3|%p2, %r1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll index 367c20749a9f..9b2d514f2a1c 100644 --- a/llvm/test/CodeGen/NVPTX/extractelement.ll +++ b/llvm/test/CodeGen/NVPTX/extractelement.ll @@ -16,7 +16,7 @@ define i16 @test_v2i8(i16 %a) { ; CHECK-NEXT: shr.s16 %rs3, %rs1, 8; ; CHECK-NEXT: add.s16 %rs4, %rs2, %rs3; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %v = bitcast i16 %a to <2 x i8> %r0 = extractelement <2 x i8> %v, i64 0 @@ -42,7 +42,7 @@ define i1 @test_v2i8_load(ptr %a) { ; CHECK-NEXT: and.b16 %rs6, %rs5, 255; ; CHECK-NEXT: setp.eq.s16 %p1, %rs6, 0; ; CHECK-NEXT: selp.u32 %r1, 1, 0, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %v = load <2 x i8>, ptr %a, align 4 %r0 = extractelement <2 x i8> %v, i64 0 @@ -72,7 +72,7 @@ define i16 @test_v4i8(i32 %a) { ; CHECK-NEXT: add.s16 %rs6, %rs3, %rs4; ; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6; ; CHECK-NEXT: cvt.u32.u16 %r6, %rs7; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %v = bitcast i32 %a to <4 x i8> %r0 = 
@@ -103,7 +103,7 @@ define i32 @test_v4i8_s32(i32 %a) {
 ; CHECK-NEXT: add.s32 %r6, %r2, %r3;
 ; CHECK-NEXT: add.s32 %r7, %r4, %r5;
 ; CHECK-NEXT: add.s32 %r8, %r6, %r7;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
 ; CHECK-NEXT: ret;
 %v = bitcast i32 %a to <4 x i8>
 %r0 = extractelement <4 x i8> %v, i64 0
@@ -134,7 +134,7 @@ define i32 @test_v4i8_u32(i32 %a) {
 ; CHECK-NEXT: add.s32 %r6, %r2, %r3;
 ; CHECK-NEXT: add.s32 %r7, %r4, %r5;
 ; CHECK-NEXT: add.s32 %r8, %r6, %r7;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
 ; CHECK-NEXT: ret;
 %v = bitcast i32 %a to <4 x i8>
 %r0 = extractelement <4 x i8> %v, i64 0
@@ -188,7 +188,7 @@ define i16 @test_v8i8(i64 %a) {
 ; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12;
 ; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14;
 ; CHECK-NEXT: cvt.u32.u16 %r13, %rs15;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
 ; CHECK-NEXT: ret;
 %v = bitcast i64 %a to <8 x i8>
 %r0 = extractelement <8 x i8> %v, i64 0
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 14e02a49f6e5..f78cfc317262 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -44,7 +44,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 ; CHECK-LABEL: test_ret_const(
 ; CHECK: mov.b16 [[R:%rs[0-9]+]], 0x3C00;
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_ret_const() #0 {
 ret half 1.0
@@ -59,7 +59,7 @@ define half @test_ret_const() #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fadd(half %a, half %b) #0 {
 %r = fadd half %a, %b
@@ -75,7 +75,7 @@ define half @test_fadd(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
 %r = fadd <1 x half> %a, %b
@@ -92,7 +92,7 @@ define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fadd_imm_0(half %b) #0 {
 %r = fadd half 1.0, %b
@@ -108,7 +108,7 @@ define half @test_fadd_imm_0(half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fadd_imm_1(half %a) #0 {
 %r = fadd half %a, 1.0
@@ -124,7 +124,7 @@ define half @test_fadd_imm_1(half %a) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fsub(half %a, half %b) #0 {
 %r = fsub half %a, %b
@@ -141,7 +141,7 @@ define half @test_fsub(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
 ; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_old_fneg(half %a) #0 {
 %r = fsub half 0.0, %a
@@ -153,7 +153,7 @@
 ; CHECK-F16-NOFTZ-NEXT: neg.f16 [[R:%rs[0-9]+]], [[A]];
 ; CHECK-F16-FTZ-NEXT: neg.ftz.f16 [[R:%rs[0-9]+]], [[A]];
 ; CHECK-NOF16-NEXT: xor.b16 [[R:%rs[0-9]+]], [[A]], -32768;
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fneg(half %a) #0 {
 %r = fneg half %a
@@ -169,7 +169,7 @@ define half @test_fneg(half %a) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
 ; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fmul(half %a, half %b) #0 {
 %r = fmul half %a, %b
@@ -186,7 +186,7 @@ define half @test_fmul(half %a, half %b) #0 {
 ; CHECK-F16-FTZ-DAG: cvt.ftz.f32.f16 [[F1:%f[0-9]+]], [[B]];
 ; CHECK-F16-FTZ-NEXT: div.rn.ftz.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]];
 ; CHECK-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[FR]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_fdiv(half %a, half %b) #0 {
 %r = fdiv half %a, %b
@@ -211,7 +211,7 @@ define half @test_fdiv(half %a, half %b) #0 {
 ; CHECK-NEXT: testp.infinite.f32 [[ISBINF:%p[0-9]+]], [[FB]];
 ; CHECK-NEXT: selp.f32 [[RESULT:%f[0-9]+]], [[FA]], [[RF]], [[ISBINF]];
 ; CHECK-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RESULT]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_frem(half %a, half %b) #0 {
 %r = frem half %a, %b
@@ -231,7 +231,7 @@ define void @test_store(half %a, ptr %b) #0 {
 ; CHECK-LABEL: test_load(
 ; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0];
 ; CHECK-NEXT: ld.b16 [[R:%rs[0-9]+]], [%[[PTR]]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_load(ptr %a) #0 {
 %r = load half, ptr %a
@@ -260,8 +260,8 @@ declare half @test_callee(half %a, half %b) #0
 ; CHECK: {
 ; CHECK-DAG: .param .align 2 .b8 param0[2];
 ; CHECK-DAG: .param .align 2 .b8 param1[2];
-; CHECK-DAG: st.param.b16 [param0+0], [[A]];
-; CHECK-DAG: st.param.b16 [param1+0], [[B]];
+; CHECK-DAG: st.param.b16 [param0], [[A]];
+; CHECK-DAG: st.param.b16 [param1], [[B]];
 ; CHECK-DAG: .param .align 2 .b8 retval0[2];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_callee,
@@ -269,9 +269,9 @@ declare half @test_callee(half %a, half %b) #0
 ; CHECK-NEXT: param0,
 ; CHECK-NEXT: param1
 ; CHECK-NEXT: );
-; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
 ; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_call(half %a, half %b) #0 {
 %r = call half @test_callee(half %a, half %b)
@@ -284,8 +284,8 @@ define half @test_call(half %a, half %b) #0 {
 ; CHECK: {
 ; CHECK-DAG: .param .align 2 .b8 param0[2];
 ; CHECK-DAG: .param .align 2 .b8 param1[2];
-; CHECK-DAG: st.param.b16 [param0+0], [[B]];
-; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: st.param.b16 [param0], [[B]];
+; CHECK-DAG: st.param.b16 [param1], [[A]];
 ; CHECK-DAG: .param .align 2 .b8 retval0[2];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_callee,
@@ -293,9 +293,9 @@ define half @test_call(half %a, half %b) #0 {
 ; CHECK-NEXT: param0,
 ; CHECK-NEXT: param1
 ; CHECK-NEXT: );
-; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
 ; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_call_flipped(half %a, half %b) #0 {
 %r = call half @test_callee(half %b, half %a)
@@ -308,8 +308,8 @@ define half @test_call_flipped(half %a, half %b) #0 {
 ; CHECK: {
 ; CHECK-DAG: .param .align 2 .b8 param0[2];
 ; CHECK-DAG: .param .align 2 .b8 param1[2];
-; CHECK-DAG: st.param.b16 [param0+0], [[B]];
-; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: st.param.b16 [param0], [[B]];
+; CHECK-DAG: st.param.b16 [param1], [[A]];
 ; CHECK-DAG: .param .align 2 .b8 retval0[2];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_callee,
@@ -317,9 +317,9 @@ define half @test_call_flipped(half %a, half %b) #0 {
 ; CHECK-NEXT: param0,
 ; CHECK-NEXT: param1
 ; CHECK-NEXT: );
-; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
 ; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_tailcall_flipped(half %a, half %b) #0 {
 %r = tail call half @test_callee(half %b, half %a)
@@ -331,7 +331,7 @@ define half @test_tailcall_flipped(half %a, half %b) #0 {
 ; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_select_param_1];
 ; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
 ; CHECK-NEXT: selp.b16 [[R:%rs[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
 %r = select i1 %c, half %a, half %b
@@ -348,7 +348,7 @@ define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
 ; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
 ; CHECK: selp.b16 [[R:%rs[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
 %cc = fcmp une half %c, %d
@@ -367,7 +367,7 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
 ; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
 ; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.f32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
 %cc = fcmp une half %c, %d
@@ -383,7 +383,7 @@ define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
 ; CHECK-F16-FTZ-DAG: setp.neu.ftz.f32 [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_select_cc_f16_f32_param_1];
 ; CHECK-NEXT: selp.b16 [[R:%rs[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
 %cc = fcmp une float %c, %d
@@ -400,7 +400,7 @@ define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_une(half %a, half %b) #0 {
 %r = fcmp une half %a, %b
@@ -416,7 +416,7 @@ define i1 @test_fcmp_une(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_ueq(half %a, half %b) #0 {
 %r = fcmp ueq half %a, %b
@@ -432,7 +432,7 @@ define i1 @test_fcmp_ueq(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_ugt(half %a, half %b) #0 {
 %r = fcmp ugt half %a, %b
@@ -448,7 +448,7 @@ define i1 @test_fcmp_ugt(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_uge(half %a, half %b) #0 {
 %r = fcmp uge half %a, %b
@@ -464,7 +464,7 @@ define i1 @test_fcmp_uge(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_ult(half %a, half %b) #0 {
 %r = fcmp ult half %a, %b
@@ -480,7 +480,7 @@ define i1 @test_fcmp_ult(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_ule(half %a, half %b) #0 {
 %r = fcmp ule half %a, %b
@@ -497,7 +497,7 @@ define i1 @test_fcmp_ule(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_uno(half %a, half %b) #0 {
 %r = fcmp uno half %a, %b
@@ -513,7 +513,7 @@ define i1 @test_fcmp_uno(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_one(half %a, half %b) #0 {
 %r = fcmp one half %a, %b
@@ -529,7 +529,7 @@ define i1 @test_fcmp_one(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_oeq(half %a, half %b) #0 {
 %r = fcmp oeq half %a, %b
@@ -545,7 +545,7 @@ define i1 @test_fcmp_oeq(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_ogt(half %a, half %b) #0 {
 %r = fcmp ogt half %a, %b
@@ -561,7 +561,7 @@ define i1 @test_fcmp_ogt(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_oge(half %a, half %b) #0 {
 %r = fcmp oge half %a, %b
@@ -577,7 +577,7 @@ define i1 @test_fcmp_oge(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_olt(half %a, half %b) #0 {
 %r = fcmp olt half %a, %b
@@ -593,7 +593,7 @@ define i1 @test_fcmp_olt(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_ole(half %a, half %b) #0 {
 %r = fcmp ole half %a, %b
@@ -609,7 +609,7 @@ define i1 @test_fcmp_ole(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i1 @test_fcmp_ord(half %a, half %b) #0 {
 %r = fcmp ord half %a, %b
@@ -649,13 +649,13 @@ else:
 ; CHECK: mov.u16 [[R:%rs[0-9]+]], [[AB:%rs[0-9]+]];
 ; CHECK: ld.b16 [[AB:%rs[0-9]+]], [%[[P1]]];
 ; CHECK: {
-; CHECK: st.param.b64 [param0+0], %[[P1]];
+; CHECK: st.param.b64 [param0], %[[P1]];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_dummy
 ; CHECK: }
 ; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1;
 ; CHECK: @[[PRED]] bra [[LOOP]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_phi(ptr %p1) #0 {
 entry:
@@ -674,7 +674,7 @@ declare i1 @test_dummy(ptr %p1) #0
 ; CHECK-LABEL: test_fptosi_i32(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fptosi_i32_param_0];
 ; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define i32 @test_fptosi_i32(half %a) #0 {
 %r = fptosi half %a to i32
@@ -684,7 +684,7 @@ define i32 @test_fptosi_i32(half %a) #0 {
 ; CHECK-LABEL: test_fptosi_i64(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fptosi_i64_param_0];
 ; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: st.param.b64 [func_retval0], [[R]];
 ; CHECK: ret;
 define i64 @test_fptosi_i64(half %a) #0 {
 %r = fptosi half %a to i64
@@ -694,7 +694,7 @@ define i64 @test_fptosi_i64(half %a) #0 {
 ; CHECK-LABEL: test_fptoui_i32(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fptoui_i32_param_0];
 ; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define i32 @test_fptoui_i32(half %a) #0 {
 %r = fptoui half %a to i32
@@ -704,7 +704,7 @@ define i32 @test_fptoui_i32(half %a) #0 {
 ; CHECK-LABEL: test_fptoui_i64(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fptoui_i64_param_0];
 ; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: st.param.b64 [func_retval0], [[R]];
 ; CHECK: ret;
 define i64 @test_fptoui_i64(half %a) #0 {
 %r = fptoui half %a to i64
@@ -714,7 +714,7 @@ define i64 @test_fptoui_i64(half %a) #0 {
 ; CHECK-LABEL: test_uitofp_i32(
 ; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
 ; CHECK: cvt.rn.f16.u32 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_uitofp_i32(i32 %a) #0 {
 %r = uitofp i32 %a to half
@@ -724,7 +724,7 @@ define half @test_uitofp_i32(i32 %a) #0 {
 ; CHECK-LABEL: test_uitofp_i64(
 ; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
 ; CHECK: cvt.rn.f16.u64 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_uitofp_i64(i64 %a) #0 {
 %r = uitofp i64 %a to half
@@ -734,7 +734,7 @@ define half @test_uitofp_i64(i64 %a) #0 {
 ; CHECK-LABEL: test_sitofp_i32(
 ; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
 ; CHECK: cvt.rn.f16.s32 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_sitofp_i32(i32 %a) #0 {
 %r = sitofp i32 %a to half
@@ -744,7 +744,7 @@ define half @test_sitofp_i32(i32 %a) #0 {
 ; CHECK-LABEL: test_sitofp_i64(
 ; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
 ; CHECK: cvt.rn.f16.s64 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_sitofp_i64(i64 %a) #0 {
 %r = sitofp i64 %a to half
@@ -761,7 +761,7 @@ define half @test_sitofp_i64(i64 %a) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
 ; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
 %c = uitofp i32 %a to half
@@ -779,7 +779,7 @@ define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
 ; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
 ; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
 ; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
 %c = sitofp i32 %a to half
@@ -790,7 +790,7 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
 ; CHECK-LABEL: test_fptrunc_float(
 ; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
 ; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_fptrunc_float(float %a) #0 {
 %r = fptrunc float %a to half
@@ -800,7 +800,7 @@ define half @test_fptrunc_float(float %a) #0 {
 ; CHECK-LABEL: test_fptrunc_double(
 ; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
 ; CHECK: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_fptrunc_double(double %a) #0 {
 %r = fptrunc double %a to half
@@ -811,7 +811,7 @@
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fpext_float_param_0];
 ; CHECK-NOFTZ: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[R:%f[0-9]+]], [[A]];
-; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK: st.param.f32 [func_retval0], [[R]];
 ; CHECK: ret;
 define float @test_fpext_float(half %a) #0 {
 %r = fpext half %a to float
@@ -821,7 +821,7 @@ define float @test_fpext_float(half %a) #0 {
 ; CHECK-LABEL: test_fpext_double(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fpext_double_param_0];
 ; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
-; CHECK: st.param.f64 [func_retval0+0], [[R]];
+; CHECK: st.param.f64 [func_retval0], [[R]];
 ; CHECK: ret;
 define double @test_fpext_double(half %a) #0 {
 %r = fpext half %a to double
@@ -832,7 +832,7 @@ define double @test_fpext_double(half %a) #0 {
 ; CHECK-LABEL: test_bitcast_halftoi16(
 ; CHECK: ld.param.b16 [[AH:%rs[0-9]+]], [test_bitcast_halftoi16_param_0];
 ; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AH]]
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define i16 @test_bitcast_halftoi16(half %a) #0 {
 %r = bitcast half %a to i16
@@ -841,7 +841,7 @@ define i16 @test_bitcast_halftoi16(half %a) #0 {
 ; CHECK-LABEL: test_bitcast_i16tohalf(
 ; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
-; CHECK: st.param.b16 [func_retval0+0], [[AS]];
+; CHECK: st.param.b16 [func_retval0], [[AS]];
 ; CHECK: ret;
 define half @test_bitcast_i16tohalf(i16 %a) #0 {
 %r = bitcast i16 %a to half
@@ -880,7 +880,7 @@ declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
 ; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ: sqrt.rn.ftz.f32 [[RF:%f[0-9]+]], [[AF]];
 ; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_sqrt(half %a) #0 {
 %r = call half @llvm.sqrt.f16(half %a)
@@ -900,7 +900,7 @@ define half @test_sqrt(half %a) #0 {
 ; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
 ; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]];
 ; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_sin(half %a) #0 #1 {
 %r = call half @llvm.sin.f16(half %a)
@@ -913,7 +913,7 @@ define half @test_sin(half %a) #0 #1 {
 ; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
 ; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]];
 ; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_cos(half %a) #0 #1 {
 %r = call half @llvm.cos.f16(half %a)
@@ -973,7 +973,7 @@ define half @test_cos(half %a) #0 #1 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
 ; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret
 define half @test_fma(half %a, half %b, half %c) #0 {
 %r = call half @llvm.fma.f16(half %a, half %b, half %c)
@@ -987,7 +987,7 @@ define half @test_fma(half %a, half %b, half %c) #0 {
 ; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ: abs.ftz.f32 [[RF:%f[0-9]+]], [[AF]];
 ; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_fabs(half %a) #0 {
 %r = call half @llvm.fabs.f16(half %a)
@@ -1004,7 +1004,7 @@ define half @test_fabs(half %a) #0 {
 ; CHECK-F16-FTZ-DAG: cvt.ftz.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-F16-FTZ: min.ftz.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
 ; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_minnum(half %a, half %b) #0 {
 %r = call half @llvm.minnum.f16(half %a, half %b)
@@ -1021,7 +1021,7 @@ define half @test_minnum(half %a, half %b) #0 {
 ; CHECK-F16-FTZ-DAG: cvt.ftz.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-F16-FTZ: max.ftz.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
 ; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_maxnum(half %a, half %b) #0 {
 %r = call half @llvm.maxnum.f16(half %a, half %b)
@@ -1034,7 +1034,7 @@ define half @test_maxnum(half %a, half %b) #0 {
 ; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AH]], 32767;
 ; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BH]], -32768;
 ; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK: st.param.b16 [func_retval0+0], [[RX]];
+; CHECK: st.param.b16 [func_retval0], [[RX]];
 ; CHECK: ret;
 define half @test_copysign(half %a, half %b) #0 {
 %r = call half @llvm.copysign.f16(half %a, half %b)
@@ -1049,7 +1049,7 @@ define half @test_copysign(half %a, half %b) #0 {
 ; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
 ; CHECK-DAG: mov.b32 {tmp, [[BX2:%rs[0-9]+]]}, [[BX0]];
 ; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK: st.param.b16 [func_retval0+0], [[RX]];
+; CHECK: st.param.b16 [func_retval0], [[RX]];
 ; CHECK: ret;
 define half @test_copysign_f32(half %a, float %b) #0 {
 %tb = fptrunc float %b to half
@@ -1066,7 +1066,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
 ; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
 ; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]];
 ; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK: st.param.b16 [func_retval0+0], [[RX]];
+; CHECK: st.param.b16 [func_retval0], [[RX]];
 ; CHECK: ret;
 define half @test_copysign_f64(half %a, double %b) #0 {
 %tb = fptrunc double %b to half
@@ -1082,7 +1082,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
 ; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
 ; CHECK-NOFTZ: cvt.f32.f16 [[XR:%f[0-9]+]], [[RX]];
 ; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[XR:%f[0-9]+]], [[RX]];
-; CHECK: st.param.f32 [func_retval0+0], [[XR]];
+; CHECK: st.param.f32 [func_retval0], [[XR]];
 ; CHECK: ret;
 define float @test_copysign_extended(half %a, half %b) #0 {
 %r = call half @llvm.copysign.f16(half %a, half %b)
@@ -1093,7 +1093,7 @@ define float @test_copysign_extended(half %a, half %b) #0 {
 ; CHECK-LABEL: test_floor(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_floor_param_0];
 ; CHECK: cvt.rmi.f16.f16 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_floor(half %a) #0 {
 %r = call half @llvm.floor.f16(half %a)
@@ -1103,7 +1103,7 @@ define half @test_floor(half %a) #0 {
 ; CHECK-LABEL: test_ceil(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_ceil_param_0];
 ; CHECK: cvt.rpi.f16.f16 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_ceil(half %a) #0 {
 %r = call half @llvm.ceil.f16(half %a)
@@ -1113,7 +1113,7 @@ define half @test_ceil(half %a) #0 {
 ; CHECK-LABEL: test_trunc(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_trunc_param_0];
 ; CHECK: cvt.rzi.f16.f16 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_trunc(half %a) #0 {
 %r = call half @llvm.trunc.f16(half %a)
@@ -1123,7 +1123,7 @@ define half @test_trunc(half %a) #0 {
 ; CHECK-LABEL: test_rint(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_rint_param_0];
 ; CHECK: cvt.rni.f16.f16 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_rint(half %a) #0 {
 %r = call half @llvm.rint.f16(half %a)
@@ -1133,7 +1133,7 @@ define half @test_rint(half %a) #0 {
 ; CHECK-LABEL: test_nearbyint(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_nearbyint_param_0];
 ; CHECK: cvt.rni.f16.f16 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_nearbyint(half %a) #0 {
 %r = call half @llvm.nearbyint.f16(half %a)
@@ -1143,7 +1143,7 @@ define half @test_nearbyint(half %a) #0 {
 ; CHECK-LABEL: test_roundeven(
 ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_roundeven_param_0];
 ; CHECK: cvt.rni.f16.f16 [[R:%rs[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_roundeven(half %a) #0 {
 %r = call half @llvm.roundeven.f16(half %a)
@@ -1155,7 +1155,7 @@ define half @test_roundeven(half %a) #0 {
 ; check the use of sign mask and 0.5 to implement round
 ; CHECK: and.b32 [[R:%r[0-9]+]], {{.*}}, -2147483648;
 ; CHECK: or.b32 {{.*}}, [[R]], 1056964608;
-; CHECK: st.param.b16 [func_retval0+0], {{.*}};
+; CHECK: st.param.b16 [func_retval0], {{.*}};
 ; CHECK: ret;
 define half @test_round(half %a) #0 {
 %r = call half @llvm.round.f16(half %a)
@@ -1173,7 +1173,7 @@ define half @test_round(half %a) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
 ; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_fmuladd(half %a, half %b, half %c) #0 {
 %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index b41f63b783d3..b11c69e064c4 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -32,7 +32,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 ; CHECK-LABEL: test_ret_const(
 ; CHECK: mov.b32 [[R:%r[0-9+]]], 1073757184;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_ret_const() #0 {
 ret <2 x half>
@@ -41,7 +41,7 @@ define <2 x half> @test_ret_const() #0 {
 ; CHECK-LABEL: test_extract_0(
 ; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_extract_0_param_0];
 ; CHECK: mov.b32 {[[R:%rs[0-9]+]], tmp}, [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_extract_0(<2 x half> %a) #0 {
 %e = extractelement <2 x half> %a, i32 0
@@ -51,7 +51,7 @@ define half @test_extract_0(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_extract_1(
 ; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_extract_1_param_0];
 ; CHECK: mov.b32 {tmp, [[R:%rs[0-9]+]]}, [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_extract_1(<2 x half> %a) #0 {
 %e = extractelement <2 x half> %a, i32 1
@@ -64,7 +64,7 @@ define half @test_extract_1(<2 x half> %a) #0 {
 ; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0;
 ; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]];
 ; CHECK: selp.b16 [[R:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: st.param.b16 [func_retval0], [[R]];
 ; CHECK: ret;
 define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
 %e = extractelement <2 x half> %a, i64 %idx
@@ -89,7 +89,7 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
 %r = fadd <2 x half> %a, %b
@@ -112,7 +112,7 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
 %r = fadd <2 x half> , %a
@@ -134,7 +134,7 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
 %r = fadd <2 x half> %a,
@@ -159,7 +159,7 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
 %r = fsub <2 x half> %a, %b
@@ -182,7 +182,7 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_fneg(<2 x half> %a) #0 {
 %r = fsub <2 x half> , %a
@@ -206,7 +206,7 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
 %r = fmul <2 x half> %a, %b
@@ -227,7 +227,7 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]];
 ; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]];
 ; CHECK-NEXT: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
 %r = fdiv <2 x half> %a, %b
@@ -265,7 +265,7 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
 ; -- merge into f16x2 and return it.
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
 %r = frem <2 x half> %a, %b
@@ -333,15 +333,15 @@ declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
 ; CHECK: {
 ; CHECK-DAG: .param .align 4 .b8 param0[4];
 ; CHECK-DAG: .param .align 4 .b8 param1[4];
-; CHECK-DAG: st.param.b32 [param0+0], [[A]];
-; CHECK-DAG: st.param.b32 [param1+0], [[B]];
+; CHECK-DAG: st.param.b32 [param0], [[A]];
+; CHECK-DAG: st.param.b32 [param1], [[B]];
 ; CHECK-DAG: .param .align 4 .b8 retval0[4];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_callee,
 ; CHECK: );
-; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0];
 ; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
 %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b)
@@ -354,15 +354,15 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK: {
 ; CHECK-DAG: .param .align 4 .b8 param0[4];
 ; CHECK-DAG: .param .align 4 .b8 param1[4];
-; CHECK-DAG: st.param.b32 [param0+0], [[B]];
-; CHECK-DAG: st.param.b32 [param1+0], [[A]];
+; CHECK-DAG: st.param.b32 [param0], [[B]];
+; CHECK-DAG: st.param.b32 [param1], [[A]];
 ; CHECK-DAG: .param .align 4 .b8 retval0[4];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_callee,
 ; CHECK: );
-; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0];
 ; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
 %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
@@ -375,15 +375,15 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK: {
 ; CHECK-DAG: .param .align 4 .b8 param0[4];
 ; CHECK-DAG: .param .align 4 .b8 param1[4];
-; CHECK-DAG: st.param.b32 [param0+0], [[B]];
-; CHECK-DAG: st.param.b32 [param1+0], [[A]];
+; CHECK-DAG: st.param.b32 [param0], [[B]];
+; CHECK-DAG: st.param.b32 [param1], [[A]];
 ; CHECK-DAG: .param .align 4 .b8 retval0[4];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_callee,
 ; CHECK: );
-; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0];
 ; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
 %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
@@ -396,7 +396,7 @@ define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2]
 ; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
 ; CHECK-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
 %r = select i1 %c, <2 x half> %a, <2 x half> %b
@@ -425,7 +425,7 @@ define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
 ; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
 ; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 {
 %cc = fcmp une <2 x half> %c, %d
@@ -451,7 +451,7 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ;
 ; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
 ; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]};
 ; CHECK-NEXT: ret;
 define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
@@ -472,7 +472,7 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
 ; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
 ; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
@@ -494,7 +494,7 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
 ; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -516,7 +516,7 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -538,7 +538,7 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -560,7 +560,7 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -582,7 +582,7 @@ define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -604,7 +604,7 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -627,7 +627,7 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -649,7 +649,7 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -671,7 +671,7 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -693,7 +693,7 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -715,7 +715,7 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -737,7 +737,7 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -759,7 +759,7 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -781,7 +781,7 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
 ; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
 ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]];
+; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]];
 ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
 ; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]];
 ; CHECK-NEXT: ret;
@@ -795,7 +795,7 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]];
-; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}
 ; CHECK: ret;
 define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
 %r = fptosi <2 x half> %a to <2 x i32>
@@ -807,7 +807,7 @@ define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
 ; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]];
-; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]}
 ; CHECK: ret;
 define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
 %r = fptosi <2 x half> %a to <2 x i64>
@@ -819,7 +819,7 @@ define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
 ; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]];
-; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}
 ; CHECK: ret;
 define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
 %r = fptoui <2 x half> %a to <2 x i32>
@@ -831,7 +831,7 @@ define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
 ; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]];
-; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]}
 ; CHECK: ret;
 define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
 %r = fptoui <2 x half> %a to <2 x i64>
@@ -843,7 +843,7 @@ define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
 ; CHECK-DAG: cvt.rn.f16.u32 [[R0:%rs[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rn.f16.u32 [[R1:%rs[0-9]+]], [[A1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
 %r = uitofp <2 x i32> %a to <2 x half>
@@ -855,7 +855,7 @@ define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
 ; CHECK-DAG: cvt.rn.f16.u64 [[R0:%rs[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rn.f16.u64 [[R1:%rs[0-9]+]], [[A1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
 %r = uitofp <2 x i64> %a to <2 x half>
@@ -867,7 +867,7 @@ define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
 ; CHECK-DAG: cvt.rn.f16.s32 [[R0:%rs[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rn.f16.s32 [[R1:%rs[0-9]+]], [[A1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
 %r = sitofp <2 x i32> %a to <2 x half>
@@ -879,7 +879,7 @@ define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
 ; CHECK-DAG: cvt.rn.f16.s64 [[R0:%rs[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rn.f16.s64 [[R1:%rs[0-9]+]], [[A1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
 %r = sitofp <2 x i64> %a to <2 x half>
@@ -906,7 +906,7 @@ define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 %c = uitofp <2 x i32> %a to <2 x half>
@@ -934,7 +934,7 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 %c = sitofp <2 x i32> %a to <2 x half>
@@ -947,7 +947,7 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[A1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 %r = fptrunc <2 x float> %a to <2 x half>
@@ -959,7 +959,7 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 ; CHECK-DAG: cvt.rn.f16.f64 [[R0:%rs[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.rn.f16.f64 [[R1:%rs[0-9]+]], [[A1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
 %r = fptrunc <2 x double> %a to <2 x half>
@@ -971,7 +971,7 @@ define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
 ; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]};
 ; CHECK: ret;
 define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
 %r = fpext <2 x half> %a to <2 x float>
@@ -983,7 +983,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
 ; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]];
 ; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: st.param.v2.f64 [func_retval0], {[[R0]], [[R1]]};
 ; CHECK: ret;
 define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
 %r = fpext <2 x half> %a to <2 x double>
@@ -993,7 +993,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
 ; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0];
-; CHECK: st.param.b32 [func_retval0+0], [[A]]
+; CHECK: st.param.b32 [func_retval0], [[A]]
 ; CHECK: ret;
 define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
 %r = bitcast <2 x half> %a to <2 x i16>
@@ -1002,7 +1002,7 @@ define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf(
 ; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xi16_to_2xhalf_param_0];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 %r = bitcast <2 x i16> %a to <2 x half>
@@ -1012,7 +1012,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 ; CHECK-LABEL: test_bitcast_float_to_2xhalf(
 ; CHECK: ld.param.f32 [[AF1:%f[0-9]+]], [test_bitcast_float_to_2xhalf_param_0];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], [[AF1]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
 %r = bitcast float %a to <2 x half>
@@ -1022,7 +1022,7 @@ define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
 ; CHECK-LABEL: test_bitcast_2xhalf_to_float(
 ; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xhalf_to_float_param_0];
 ; CHECK: mov.b32 [[AF1:%f[0-9]+]], [[R]];
-; CHECK: st.param.f32 [func_retval0+0], [[AF1]];
+; CHECK: st.param.f32 [func_retval0], [[AF1]];
 ; CHECK: ret;
 define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 {
 %r = bitcast <2 x half> %a to float
@@ -1063,7 +1063,7 @@ declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
 ; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_sqrt(<2 x half> %a) #0 {
 %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a)
@@ -1087,7 +1087,7 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 {
 ; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
 ; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
 %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
@@ -1104,7 +1104,7 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
 ; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
 ; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
 %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
@@ -1175,7 +1175,7 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret
 define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
 %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@@ -1193,7 +1193,7 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
 ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
 ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
 ; CHECK-F16: and.b32 [[R:%r[0-9]+]], [[A]], 2147450879;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_fabs(<2 x half> %a) #0 {
 %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
@@ -1214,7 +1214,7 @@ define <2 x half> @test_fabs(<2 x half> %a) #0 {
 ; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
 ; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
 %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
@@ -1235,7 +1235,7 @@ define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
 ; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
 ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) @@ -1257,7 +1257,7 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-F16-DAG: and.b32 [[R0:%r[0-9]+]], [[B]], -2147450880; ; CHECK-F16-DAG: and.b32 [[R1:%r[0-9]+]], [[A]], 2147450879; ; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R1]], [[R0]] -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) @@ -1285,7 +1285,7 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-F16-DAG: and.b32 [[R3:%r[0-9]+]], [[R2]], -2147450880; ; CHECK-F16-DAG: and.b32 [[R4:%r[0-9]+]], [[A]], 2147450879; ; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R4]], [[R3]] -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { %tb = fptrunc <2 x float> %b to <2 x half> @@ -1316,7 +1316,7 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16-DAG: and.b32 [[R3:%r[0-9]+]], [[R2]], -2147450880; ; CHECK-F16-DAG: and.b32 [[R4:%r[0-9]+]], [[A]], 2147450879; ; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R4]], [[R3]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { %tb = fptrunc <2 x double> %b to <2 x half> @@ -1343,7 +1343,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-F16-DAG: mov.b32 {[[R3:%rs[0-9]+]], [[R4:%rs[0-9]+]]}, [[R2]] ; CHECK-F16-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[R3]] ; CHECK-F16-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[R4]] -; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; +; CHECK: st.param.v2.f32 [func_retval0], {[[XR0]], [[XR1]]}; ; CHECK: ret; define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) @@ -1357,7 +1357,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; ; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_floor(<2 x half> %a) #0 { %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) @@ -1370,7 +1370,7 @@ define <2 x half> @test_floor(<2 x half> %a) #0 { ; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; ; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_ceil(<2 x half> %a) #0 { %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) @@ -1383,7 +1383,7 @@ define <2 x half> @test_ceil(<2 x half> %a) #0 { ; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; ; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: 
ret; define <2 x half> @test_trunc(<2 x half> %a) #0 { %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) @@ -1396,7 +1396,7 @@ define <2 x half> @test_trunc(<2 x half> %a) #0 { ; CHECK-DAG: cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; ; CHECK-DAG: cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_rint(<2 x half> %a) #0 { %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) @@ -1409,7 +1409,7 @@ define <2 x half> @test_rint(<2 x half> %a) #0 { ; CHECK-DAG: cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; ; CHECK-DAG: cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_nearbyint(<2 x half> %a) #0 { %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) @@ -1422,7 +1422,7 @@ define <2 x half> @test_nearbyint(<2 x half> %a) #0 { ; CHECK-DAG: cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; ; CHECK-DAG: cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; ; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_roundeven(<2 x half> %a) #0 { %r = call <2 x half> @llvm.roundeven.f16(<2 x half> %a) @@ -1436,7 +1436,7 @@ define <2 x half> @test_roundeven(<2 x half> %a) #0 { ; CHECK: or.b32 {{.*}}, [[R1]], 1056964608; ; CHECK: and.b32 [[R2:%r[0-9]+]], {{.*}}, -2147483648; ; CHECK: or.b32 {{.*}}, [[R2]], 1056964608; -; CHECK: st.param.b32 [func_retval0+0], {{.*}}; +; CHECK: st.param.b32 [func_retval0], {{.*}}; ; CHECK: ret; define <2 x half> @test_round(<2 x half> %a) #0 { %r = call <2 x half> @llvm.round.f16(<2 x half> %a) @@ -1465,7 +1465,7 @@ define <2 x half> @test_round(<2 x half> %a) #0 { ; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] ; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} ; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) diff --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll index c2f23124049c..8ad5ab6a2875 100644 --- a/llvm/test/CodeGen/NVPTX/i128-param.ll +++ b/llvm/test/CodeGen/NVPTX/i128-param.ll @@ -30,9 +30,9 @@ start: ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0 ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK-NEXT: st.param.v2.b64 [param0+0], {%[[REG0]], %[[REG1]]} + ; CHECK-NEXT: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]} ; CHECK: .param .align 16 .b8 param1[16]; - ; CHECK-NEXT: st.param.v2.b64 [param1+0], {%[[REG2]], %[[REG3]]} + ; CHECK-NEXT: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]} ; CHECK: } // callseq [[CALLSEQ_ID]] call void @callee(i128 %0, i128 %1, ptr %2) @@ -49,9 +49,9 @@ start: ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0 ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK: st.param.v2.b64 [param0+0], {%[[REG0]], %[[REG1]]} + ; CHECK: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]} ; CHECK: .param .align 16 .b8 param1[16]; - ; CHECK: st.param.v2.b64 [param1+0], {%[[REG2]], %[[REG3]]} + ; CHECK: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]} ; CHECK: } // callseq [[CALLSEQ_ID]] call void @callee(i128 %0, i128 %1, ptr %2) diff --git 
a/llvm/test/CodeGen/NVPTX/i128-retval.ll b/llvm/test/CodeGen/NVPTX/i128-retval.ll index df173536c297..554c43b52bf0 100644 --- a/llvm/test/CodeGen/NVPTX/i128-retval.ll +++ b/llvm/test/CodeGen/NVPTX/i128-retval.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee( define i128 @callee(i128) { ; CHECK: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0]; - ; CHECK: st.param.v2.b64 [func_retval0+0], {%[[REG0]], %[[REG1]]} + ; CHECK: st.param.v2.b64 [func_retval0], {%[[REG0]], %[[REG1]]} ret i128 %0 } @@ -17,7 +17,7 @@ start: ; CHECK: { // callseq 0, 0 ; CHECK: .param .align 16 .b8 retval0[16]; ; CHECK: call.uni (retval0), - ; CHECK: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [retval0+0]; + ; CHECK: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [retval0]; ; CHECK: } // callseq 0 %a = call i128 @callee(i128 %0) diff --git a/llvm/test/CodeGen/NVPTX/i128-struct.ll b/llvm/test/CodeGen/NVPTX/i128-struct.ll index cecfd4f6ce42..d7a00a66bf44 100644 --- a/llvm/test/CodeGen/NVPTX/i128-struct.ll +++ b/llvm/test/CodeGen/NVPTX/i128-struct.ll @@ -8,7 +8,7 @@ define { i128, i128 } @foo(i64 %a, i32 %b) { %3 = insertvalue { i128, i128 } undef, i128 %1, 0 %4 = insertvalue { i128, i128 } %3, i128 %2, 1 - ; CHECK: st.param.v2.b64 [func_retval0+0], {%[[REG1:rd[0-9]+]], %[[REG2:rd[0-9]+]]}; + ; CHECK: st.param.v2.b64 [func_retval0], {%[[REG1:rd[0-9]+]], %[[REG2:rd[0-9]+]]}; ; CHECK: st.param.v2.b64 [func_retval0+16], {%[[REG3:rd[0-9]+]], %[[REG4:rd[0-9]+]]}; ret { i128, i128 } %4 } diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 396c29512933..895787d68adf 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -145,7 +145,7 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: xor.b64 %rd112, %rd110, %rd2; ; CHECK-NEXT: sub.cc.s64 %rd113, %rd111, %rd2; ; CHECK-NEXT: subc.cc.s64 %rd114, %rd112, %rd2; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd113, %rd114}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd113, %rd114}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, %rhs ret i128 %div @@ -279,7 +279,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: mul.lo.s64 %rd98, %rd3, %rd113; ; CHECK-NEXT: sub.cc.s64 %rd99, %rd41, %rd98; ; CHECK-NEXT: subc.cc.s64 %rd100, %rd42, %rd97; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd99, %rd100}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd99, %rd100}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs ret i128 %div @@ -299,7 +299,7 @@ define i128 @srem_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: and.b64 %rd7, %rd5, -8589934592; ; CHECK-NEXT: sub.cc.s64 %rd8, %rd1, %rd7; ; CHECK-NEXT: subc.cc.s64 %rd9, %rd2, %rd6; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd8, %rd9}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd9}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, 8589934592 ret i128 %div @@ -314,7 +314,7 @@ define i128 @urem_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [urem_i128_pow2k_param_0]; ; CHECK-NEXT: and.b64 %rd3, %rd1, 8589934591; ; CHECK-NEXT: mov.u64 %rd4, 0; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, 8589934592 ret i128 %div @@ -456,7 +456,7 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5; ; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5; ; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, 
%rd5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd106, %rd107}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107}; ; CHECK-NEXT: ret; %div = sdiv i128 %lhs, %rhs ret i128 %div @@ -582,7 +582,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92; ; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91; ; CHECK-NEXT: $L__BB5_5: // %udiv-end -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd105, %rd106}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106}; ; CHECK-NEXT: ret; %div = udiv i128 %lhs, %rhs ret i128 %div @@ -603,7 +603,7 @@ define i128 @sdiv_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: shr.u64 %rd8, %rd5, 33; ; CHECK-NEXT: or.b64 %rd9, %rd8, %rd7; ; CHECK-NEXT: shr.s64 %rd10, %rd6, 33; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd9, %rd10}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd9, %rd10}; ; CHECK-NEXT: ret; %div = sdiv i128 %lhs, 8589934592 ret i128 %div @@ -620,7 +620,7 @@ define i128 @udiv_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: shr.u64 %rd4, %rd1, 33; ; CHECK-NEXT: or.b64 %rd5, %rd4, %rd3; ; CHECK-NEXT: shr.u64 %rd6, %rd2, 33; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd6}; ; CHECK-NEXT: ret; %div = udiv i128 %lhs, 8589934592 ret i128 %div @@ -636,7 +636,7 @@ define i128 @add_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [add_i128_param_1]; ; CHECK-NEXT: add.cc.s64 %rd5, %rd1, %rd3; ; CHECK-NEXT: addc.cc.s64 %rd6, %rd2, %rd4; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd6}; ; CHECK-NEXT: ret; %result = add i128 %lhs, %rhs ret i128 %result diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index ce9adfc7aa4f..988438bebea6 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -21,7 +21,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" ; COMMON-LABEL: test_ret_const( ; COMMON: mov.b32 [[R:%r[0-9+]]], 131073; -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_ret_const() #0 { ret <2 x i16> <i16 1, i16 2> @@ -31,7 +31,7 @@ define <2 x i16> @test_ret_const() #0 { ; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_extract_0_param_0]; ; COMMON: mov.b32 {[[RS:%rs[0-9]+]], tmp}, [[A]]; ; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: st.param.b32 [func_retval0], [[R]]; ; COMMON: ret; define i16 @test_extract_0(<2 x i16> %a) #0 { %e = extractelement <2 x i16> %a, i32 0 ret i16 %e @@ -42,7 +42,7 @@ define i16 @test_extract_0(<2 x i16> %a) #0 { ; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_extract_1_param_0]; ; COMMON: mov.b32 {tmp, [[RS:%rs[0-9]+]]}, [[A]]; ; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: st.param.b32 [func_retval0], [[R]]; ; COMMON: ret; define i16 @test_extract_1(<2 x i16> %a) #0 { %e = extractelement <2 x i16> %a, i32 1 @@ -56,7 +56,7 @@ define i16 @test_extract_1(<2 x i16> %a) #0 { ; COMMON-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]]; ; COMMON: selp.b16 [[RS:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]]; ; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: st.param.b32 [func_retval0], [[R]]; ; COMMON: ret; define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { %e = 
extractelement <2 x i16> %a, i64 %idx @@ -75,7 +75,7 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { ; NO-I16x2-DAG: add.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { %r = add <2 x i16> %a, %b @@ -94,7 +94,7 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2; ; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]}; ; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { %r = add <2 x i16> <i16 1, i16 2>, %a @@ -112,7 +112,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { ; NO-I16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2; ; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]}; ; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { %r = add <2 x i16> %a, <i16 1, i16 2> @@ -130,7 +130,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { ; COMMON-DAG: sub.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; COMMON-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { %r = sub <2 x i16> %a, %b @@ -149,7 +149,7 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-DAG: max.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp sgt <2 x i16> %a, %b @@ -169,7 +169,7 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-DAG: max.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp ugt <2 x i16> %a, %b @@ -189,7 +189,7 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-DAG: min.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp sle <2 x i16> %a, %b @@ -209,7 +209,7 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-DAG: min.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp ule <2 x i16> %a, %b @@ -227,7 +227,7 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-DAG: mul.lo.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; COMMON-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; COMMON-NEXT: 
st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { %r = mul <2 x i16> %a, %b @@ -239,7 +239,7 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_or_param_0]; ; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_or_param_1]; ; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], [[B]]; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 { %r = or <2 x i16> %a, %b @@ -255,7 +255,7 @@ define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5; ; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]}; ; COMMON: or.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; define <2 x i16> @test_or_computed(i16 %a) { %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0 %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1 @@ -267,7 +267,7 @@ define <2 x i16> @test_or_computed(i16 %a) { ; COMMON-LABEL: test_or_imm_0( ; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_or_imm_0_param_0]; ; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 { %r = or <2 x i16> <i16 1, i16 2>, %a @@ -277,7 +277,7 @@ define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 { ; COMMON-LABEL: test_or_imm_1( ; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_or_imm_1_param_0]; ; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 { %r = or <2 x i16> %a, <i16 1, i16 2> @@ -288,7 +288,7 @@ define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 { ; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_xor_param_0]; ; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_xor_param_1]; ; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], [[B]]; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 { %r = xor <2 x i16> %a, %b @@ -302,7 +302,7 @@ define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5; ; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]}; ; COMMON: xor.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; define <2 x i16> @test_xor_computed(i16 %a) { %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0 %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1 @@ -314,7 +314,7 @@ define <2 x i16> @test_xor_computed(i16 %a) { ; COMMON-LABEL: test_xor_imm_0( ; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_xor_imm_0_param_0]; ; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 { %r = xor <2 x i16> <i16 1, i16 2>, %a @@ -324,7 +324,7 @@ define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 { ; COMMON-LABEL: test_xor_imm_1( ; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_xor_imm_1_param_0]; 
; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 { %r = xor <2 x i16> %a, <i16 1, i16 2> @@ -335,7 +335,7 @@ define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 { ; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_and_param_0]; ; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_and_param_1]; ; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], [[B]]; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 { %r = and <2 x i16> %a, %b @@ -351,7 +351,7 @@ define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5; ; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]}; ; COMMON: and.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; define <2 x i16> @test_and_computed(i16 %a) { %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0 %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1 @@ -363,7 +363,7 @@ define <2 x i16> @test_and_computed(i16 %a) { ; COMMON-LABEL: test_and_imm_0( ; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_and_imm_0_param_0]; ; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 { %r = and <2 x i16> <i16 1, i16 2>, %a @@ -373,7 +373,7 @@ define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 { ; COMMON-LABEL: test_and_imm_1( ; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_and_imm_1_param_0]; ; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 { %r = and <2 x i16> %a, <i16 1, i16 2> @@ -441,15 +441,15 @@ declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0 ; COMMON: { ; COMMON-DAG: .param .align 4 .b8 param0[4]; ; COMMON-DAG: .param .align 4 .b8 param1[4]; -; COMMON-DAG: st.param.b32 [param0+0], [[A]]; -; COMMON-DAG: st.param.b32 [param1+0], [[B]]; +; COMMON-DAG: st.param.b32 [param0], [[A]]; +; COMMON-DAG: st.param.b32 [param1], [[B]]; ; COMMON-DAG: .param .align 4 .b8 retval0[4]; ; COMMON: call.uni (retval0), ; COMMON-NEXT: test_callee, ; COMMON: ); -; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; COMMON-NEXT: } -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { %r = call <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) @@ -462,15 +462,15 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON: { ; COMMON-DAG: .param .align 4 .b8 param0[4]; ; COMMON-DAG: .param .align 4 .b8 param1[4]; -; COMMON-DAG: st.param.b32 [param0+0], [[B]]; -; COMMON-DAG: st.param.b32 [param1+0], [[A]]; +; COMMON-DAG: st.param.b32 [param0], [[B]]; +; COMMON-DAG: st.param.b32 [param1], [[A]]; ; COMMON-DAG: .param .align 4 .b8 retval0[4]; ; COMMON: call.uni (retval0), ; COMMON-NEXT: test_callee, ; COMMON: ); -; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; COMMON-NEXT: } -; 
COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { %r = call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a) @@ -483,15 +483,15 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON: { ; COMMON-DAG: .param .align 4 .b8 param0[4]; ; COMMON-DAG: .param .align 4 .b8 param1[4]; -; COMMON-DAG: st.param.b32 [param0+0], [[B]]; -; COMMON-DAG: st.param.b32 [param1+0], [[A]]; +; COMMON-DAG: st.param.b32 [param0], [[B]]; +; COMMON-DAG: st.param.b32 [param1], [[A]]; ; COMMON-DAG: .param .align 4 .b8 retval0[4]; ; COMMON: call.uni (retval0), ; COMMON-NEXT: test_callee, ; COMMON: ); -; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; COMMON-NEXT: } -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { %r = tail call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a) @@ -504,7 +504,7 @@ define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] ; COMMON-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; ; COMMON-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]]; -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 { %r = select i1 %c, <2 x i16> %a, <2 x i16> %b @@ -525,7 +525,7 @@ define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 { ; COMMON-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; ; COMMON-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; ; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) #0 { %cc = icmp ne <2 x i16> %c, %d @@ -544,7 +544,7 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x ; COMMON-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], [[D1]] ; COMMON-DAG: selp.b32 [[R0:%r[0-9]+]], [[A0]], [[B0]], [[P0]]; ; COMMON-DAG: selp.b32 [[R1:%r[0-9]+]], [[A1]], [[B1]], [[P1]]; -; COMMON-NEXT: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}; +; COMMON-NEXT: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}; ; COMMON-NEXT: ret; define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, <2 x i16> %c, <2 x i16> %d) #0 { @@ -565,7 +565,7 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, ; COMMON-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; ; COMMON-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; ; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; ; COMMON-NEXT: ret; define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, <2 x i32> %c, <2 x i32> %d) #0 { @@ -580,7 +580,7 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, ; COMMON-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A0]]; ; COMMON-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[A1]]; ; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: 
st.param.b32 [func_retval0], [[R]]; ; COMMON: ret; define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 { %r = trunc <2 x i32> %a to <2 x i16> @@ -592,7 +592,7 @@ define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 { ; COMMON-DAG: cvt.u16.u64 [[R0:%rs[0-9]+]], [[A0]]; ; COMMON-DAG: cvt.u16.u64 [[R1:%rs[0-9]+]], [[A1]]; ; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: st.param.b32 [func_retval0], [[R]]; ; COMMON: ret; define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 { %r = trunc <2 x i64> %a to <2 x i16> @@ -604,7 +604,7 @@ define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 { ; COMMON: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] ; COMMON-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[A0]]; ; COMMON-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[A1]]; -; COMMON-NEXT: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}; +; COMMON-NEXT: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}; ; COMMON: ret; define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { %r = zext <2 x i16> %a to <2 x i32> @@ -616,7 +616,7 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { ; COMMON: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] ; COMMON-DAG: cvt.u64.u16 [[R0:%rd[0-9]+]], [[A0]]; ; COMMON-DAG: cvt.u64.u16 [[R1:%rd[0-9]+]], [[A1]]; -; COMMON-NEXT: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}; +; COMMON-NEXT: st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]}; ; COMMON: ret; define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 { %r = zext <2 x i16> %a to <2 x i64> @@ -625,7 +625,7 @@ define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 { ; COMMON-LABEL: test_bitcast_i32_to_2xi16( ; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_i32_to_2xi16_param_0]; -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: st.param.b32 [func_retval0], [[R]]; ; COMMON: ret; define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 { %r = bitcast i32 %a to <2 x i16> @@ -634,7 +634,7 @@ define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 { ; COMMON-LABEL: test_bitcast_2xi16_to_i32( ; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xi16_to_i32_param_0]; -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: st.param.b32 [func_retval0], [[R]]; ; COMMON: ret; define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 { %r = bitcast <2 x i16> %a to i32 @@ -645,7 +645,7 @@ define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 { ; COMMON: ld.param.u16 [[RS1:%rs[0-9]+]], [test_bitcast_2xi16_to_2xhalf_param_0]; ; COMMON: mov.u16 [[RS2:%rs[0-9]+]], 5; ; COMMON: mov.b32 [[R:%r[0-9]+]], {[[RS1]], [[RS2]]}; -; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: st.param.b32 [func_retval0], [[R]]; ; COMMON: ret; define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 { %ins.0 = insertelement <2 x i16> undef, i16 %a, i32 0 @@ -659,7 +659,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 { ; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_shufflevector_param_0]; ; COMMON: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[R]]; ; COMMON: mov.b32 [[R1:%r[0-9]+]], {[[RS1]], [[RS0]]}; -; COMMON: st.param.b32 [func_retval0+0], [[R1]]; +; COMMON: st.param.b32 [func_retval0], [[R1]]; ; COMMON: ret; define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> @@ -671,7 +671,7 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { ; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_insertelement_param_0]; ; COMMON: { .reg .b16 tmp; mov.b32 {[[R0:%rs[0-9]+]], tmp}, [[A]]; } ; COMMON: mov.b32 [[R1:%r[0-9]+]], {[[R0]], 
[[B]]}; -; COMMON: st.param.b32 [func_retval0+0], [[R1]]; +; COMMON: st.param.b32 [func_retval0], [[R1]]; ; COMMON: ret; define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 { %i = insertelement <2 x i16> %a, i16 %x, i64 1 diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 96a4359d0ec4..5b5662a1eea7 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -18,7 +18,7 @@ define <4 x i8> @test_ret_const() #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, -66911489; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; ret <4 x i8> <i8 -1, i8 2, i8 3, i8 -4> } @@ -31,7 +31,7 @@ define i8 @test_extract_0(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_0_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %e = extractelement <4 x i8> %a, i32 0 ret i8 %e @@ -45,7 +45,7 @@ define i8 @test_extract_1(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_1_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %e = extractelement <4 x i8> %a, i32 1 ret i8 %e @@ -59,7 +59,7 @@ define i8 @test_extract_2(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_2_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %e = extractelement <4 x i8> %a, i32 2 ret i8 %e @@ -73,7 +73,7 @@ define i8 @test_extract_3(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_3_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %e = extractelement <4 x i8> %a, i32 3 ret i8 %e @@ -91,7 +91,7 @@ define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 { ; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; ; CHECK-NEXT: shl.b32 %r3, %r2, 3; ; CHECK-NEXT: bfe.u32 %r4, %r1, %r3, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %e = extractelement <4 x i8> %a, i64 %idx ret i8 %e @@ -133,7 +133,7 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, %b ret <4 x i8> %r @@ -166,7 +166,7 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-NEXT: add.s16 %rs8, %rs7, 4; ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; ; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; +; CHECK-NEXT: st.param.b32 [func_retval0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a ret <4 x i8> %r @@ -199,7 +199,7 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-NEXT: add.s16 %rs8, %rs7, 4; ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; ; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; +; CHECK-NEXT: st.param.b32 [func_retval0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> 
%a, <i8 1, i8 2, i8 3, i8 4> ret <4 x i8> %r @@ -241,7 +241,7 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %r = sub <4 x i8> %a, %b ret <4 x i8> %r @@ -283,7 +283,7 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; ; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; ; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; +; CHECK-NEXT: st.param.b32 [func_retval0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b @@ -318,7 +318,7 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; ; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b @@ -361,7 +361,7 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; ; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; ; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; +; CHECK-NEXT: st.param.b32 [func_retval0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b @@ -396,7 +396,7 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; ; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b @@ -436,7 +436,7 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; ; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; ; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; +; CHECK-NEXT: st.param.b32 [func_retval0], %r22; ; CHECK-NEXT: ret; %cmp = icmp eq <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c @@ -476,7 +476,7 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; ; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; ; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; +; CHECK-NEXT: st.param.b32 [func_retval0], %r22; ; CHECK-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c @@ -519,7 +519,7 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; +; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %r = mul <4 x i8> %a, %b ret <4 x i8> %r @@ -534,7 +534,7 @@ define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r3, [test_or_param_1]; ; CHECK-NEXT: ld.param.u32 %r4, [test_or_param_0]; ; CHECK-NEXT: or.b32 %r5, %r4, %r3; -; 
CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %r = or <4 x i8> %a, %b ret <4 x i8> %r @@ -554,7 +554,7 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; ; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; ; CHECK-NEXT: or.b32 %r8, %r6, %r4; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -570,7 +570,7 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0]; ; CHECK-NEXT: or.b32 %r2, %r1, 67305985; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %r = or <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a ret <4 x i8> %r @@ -584,7 +584,7 @@ define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0]; ; CHECK-NEXT: or.b32 %r2, %r1, 67305985; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %r = or <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4> ret <4 x i8> %r @@ -599,7 +599,7 @@ define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 { ; CHECK-NEXT: ld.param.u32 %r3, [test_xor_param_1]; ; CHECK-NEXT: ld.param.u32 %r4, [test_xor_param_0]; ; CHECK-NEXT: xor.b32 %r5, %r4, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %r = xor <4 x i8> %a, %b ret <4 x i8> %r @@ -619,7 +619,7 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; ; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; ; CHECK-NEXT: xor.b32 %r8, %r6, %r4; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -635,7 +635,7 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0]; ; CHECK-NEXT: xor.b32 %r2, %r1, 67305985; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %r = xor <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a ret <4 x i8> %r @@ -649,7 +649,7 @@ define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0]; ; CHECK-NEXT: xor.b32 %r2, %r1, 67305985; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %r = xor <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4> ret <4 x i8> %r @@ -664,7 +664,7 @@ define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 { ; CHECK-NEXT: ld.param.u32 %r3, [test_and_param_1]; ; CHECK-NEXT: ld.param.u32 %r4, [test_and_param_0]; ; CHECK-NEXT: and.b32 %r5, %r4, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %r = and <4 x i8> %a, %b ret <4 x i8> %r @@ -684,7 +684,7 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; ; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; ; CHECK-NEXT: and.b32 %r8, %r6, %r4; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -700,7 +700,7 @@ define <4 x i8> 
@test_and_imm_0(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0]; ; CHECK-NEXT: and.b32 %r2, %r1, 67305985; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %r = and <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a ret <4 x i8> %r @@ -714,7 +714,7 @@ define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0]; ; CHECK-NEXT: and.b32 %r2, %r1, 67305985; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %r = and <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4> ret <4 x i8> %r @@ -828,9 +828,9 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0+0], %r1; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1+0], %r2; +; CHECK-NEXT: st.param.b32 [param1], %r2; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: test_callee, @@ -838,9 +838,9 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: param0, ; CHECK-NEXT: param1 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.b32 %r3, [retval0+0]; +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = call <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) ret <4 x i8> %r @@ -856,9 +856,9 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0+0], %r2; +; CHECK-NEXT: st.param.b32 [param0], %r2; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1+0], %r1; +; CHECK-NEXT: st.param.b32 [param1], %r1; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: test_callee, @@ -866,9 +866,9 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: param0, ; CHECK-NEXT: param1 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.b32 %r3, [retval0+0]; +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a) ret <4 x i8> %r @@ -884,9 +884,9 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0]; ; CHECK-NEXT: { // callseq 2, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0+0], %r2; +; CHECK-NEXT: st.param.b32 [param0], %r2; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1+0], %r1; +; CHECK-NEXT: st.param.b32 [param1], %r1; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: test_callee, @@ -894,9 +894,9 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: param0, ; CHECK-NEXT: param1 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.b32 %r3, [retval0+0]; +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 2 -; 
CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = tail call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a) ret <4 x i8> %r @@ -916,7 +916,7 @@ define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 { ; CHECK-NEXT: ld.param.u32 %r2, [test_select_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_param_0]; ; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = select i1 %c, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r @@ -960,7 +960,7 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; ; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27; +; CHECK-NEXT: st.param.b32 [func_retval0], %r27; ; CHECK-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b @@ -994,7 +994,7 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b, ; CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3; ; CHECK-NEXT: selp.b32 %r21, %r2, %r6, %p2; ; CHECK-NEXT: selp.b32 %r22, %r1, %r5, %p1; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r22, %r21, %r20, %r19}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19}; ; CHECK-NEXT: ret; <4 x i8> %c, <4 x i8> %d) #0 { %cc = icmp ne <4 x i8> %c, %d @@ -1032,7 +1032,7 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1; ; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; +; CHECK-NEXT: st.param.b32 [func_retval0], %r25; ; CHECK-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { %cc = icmp ne <4 x i32> %c, %d @@ -1051,7 +1051,7 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { ; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8; ; CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8; ; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i32> %a to <4 x i8> ret <4 x i8> %r @@ -1073,7 +1073,7 @@ define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { ; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8; ; CHECK-NEXT: cvt.u32.u64 %r6, %rd4; ; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i64> %a to <4 x i8> ret <4 x i8> %r @@ -1090,7 +1090,7 @@ define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 { ; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; ; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r5, %r4, %r3, %r2}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r4, %r3, %r2}; ; CHECK-NEXT: ret; %r = zext <4 x i8> %a to <4 x i32> ret <4 x i32> %r @@ -1116,7 +1116,7 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 { ; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: cvt.u64.u32 %rd7, %r5; ; CHECK-NEXT: and.b64 %rd8, %rd7, 255; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd8, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6}; ; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd4, %rd2}; ; CHECK-NEXT: ret; %r = zext <4 x i8> %a to <4 x i64> @@ -1130,7 +1130,7 @@ define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; 
CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_4xi8_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast i32 %a to <4 x i8> ret <4 x i8> %r @@ -1145,7 +1145,7 @@ define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.f32 %f1, [test_bitcast_float_to_4xi8_param_0]; ; CHECK-NEXT: mov.b32 %r1, %f1; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast float %a to <4 x i8> ret <4 x i8> %r @@ -1158,7 +1158,7 @@ define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_4xi8_to_i32_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %r = bitcast <4 x i8> %a to i32 ret i32 %r @@ -1173,7 +1173,7 @@ define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_4xi8_to_float_param_0]; ; CHECK-NEXT: mov.b32 %f1, %r2; -; CHECK-NEXT: st.param.f32 [func_retval0+0], %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f1; ; CHECK-NEXT: ret; %r = bitcast <4 x i8> %a to float ret float %r @@ -1192,7 +1192,7 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-NEXT: bfi.b32 %r2, 5, %r1, 8, 8; ; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8; ; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -1212,7 +1212,7 @@ define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 { ; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0]; ; CHECK-NEXT: // implicit-def: %r3 ; CHECK-NEXT: prmt.b32 %r2, %r1, %r3, 291; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x i8> %s @@ -1227,7 +1227,7 @@ define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r2, [test_shufflevector_2_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_2_param_0]; ; CHECK-NEXT: prmt.b32 %r3, %r1, %r2, 9527; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %s = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 7, i32 3, i32 5, i32 2> ret <4 x i8> %s @@ -1245,7 +1245,7 @@ define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 { ; CHECK-NEXT: ld.param.u32 %r1, [test_insertelement_param_0]; ; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; ; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %i = insertelement <4 x i8> %a, i8 %x, i64 1 ret <4 x i8> %i @@ -1276,7 +1276,7 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; +; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> ret <4 x i8> %r @@ -1307,7 +1307,7 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; ; 
CHECK-NEXT: cvt.u32.u16 %r12, %rs12; ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; +; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> ret <4 x i8> %r diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll index ac6c4e262fd6..1799c86deda7 100644 --- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll +++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll @@ -27,9 +27,9 @@ define internal i32 @foo() { ; CHECK-NEXT: add.u64 %rd2, %SP, 0; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 1 .b8 param0[1]; -; CHECK-NEXT: st.param.b8 [param0+0], %rs1; +; CHECK-NEXT: st.param.b8 [param0], %rs1; ; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1+0], %rd2; +; CHECK-NEXT: st.param.b64 [param1], %rd2; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _); ; CHECK-NEXT: call (retval0), @@ -39,9 +39,9 @@ define internal i32 @foo() { ; CHECK-NEXT: param1 ; CHECK-NEXT: ) ; CHECK-NEXT: , prototype_0; -; CHECK-NEXT: ld.param.b32 %r1, [retval0+0]; +; CHECK-NEXT: ld.param.b32 %r1, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; entry: %s = alloca %struct.S, align 1 @@ -69,9 +69,9 @@ define internal i32 @bar() { ; CHECK-NEXT: add.u64 %rd3, %SP, 0; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.b64 [param0+0], %rd2; +; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1+0], %rd3; +; CHECK-NEXT: st.param.b64 [param1], %rd3; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _); ; CHECK-NEXT: call (retval0), @@ -81,9 +81,9 @@ define internal i32 @bar() { ; CHECK-NEXT: param1 ; CHECK-NEXT: ) ; CHECK-NEXT: , prototype_1; -; CHECK-NEXT: ld.param.b32 %r1, [retval0+0]; +; CHECK-NEXT: ld.param.b32 %r1, [retval0]; ; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; entry: %s = alloca %struct.U, align 8 diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll index b201fb98f3e6..dbd4f8a55fac 100644 --- a/llvm/test/CodeGen/NVPTX/jump-table.ll +++ b/llvm/test/CodeGen/NVPTX/jump-table.ll @@ -101,7 +101,7 @@ define i32 @test2(i32 %tmp158) { ; CHECK-NEXT: brx.idx %r2, $L_brx_0; ; CHECK-NEXT: $L__BB1_7: // %bb339 ; CHECK-NEXT: mov.b32 %r7, 12; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; ; CHECK-NEXT: $L__BB1_5: // %entry ; CHECK-NEXT: setp.eq.s32 %p3, %r1, 1024; @@ -109,27 +109,27 @@ define i32 @test2(i32 %tmp158) { ; CHECK-NEXT: bra.uni $L__BB1_6; ; CHECK-NEXT: $L__BB1_3: // %bb338 ; CHECK-NEXT: mov.b32 %r8, 11; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; ; CHECK-NEXT: $L__BB1_10: // %bb342 ; CHECK-NEXT: mov.b32 %r4, 15; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; ; CHECK-NEXT: $L__BB1_6: // %bb336 ; CHECK-NEXT: mov.b32 %r9, 10; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9; +; CHECK-NEXT: st.param.b32 [func_retval0], %r9; ; 
CHECK-NEXT: ret;
 ; CHECK-NEXT: $L__BB1_8: // %bb340
 ; CHECK-NEXT: mov.b32 %r6, 13;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT: ret;
 ; CHECK-NEXT: $L__BB1_9: // %bb341
 ; CHECK-NEXT: mov.b32 %r5, 14;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT: ret;
 ; CHECK-NEXT: $L__BB1_11: // %bb343
 ; CHECK-NEXT: mov.b32 %r3, 18;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT: ret;
 entry:
 switch i32 %tmp158, label %bb336 [
diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
index dc20441a67a8..47f65ecbcfa6 100644
--- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
+++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -6,7 +6,7 @@ declare <4 x float> @bar()
 ; CHECK-LABEL: .func foo(
 define void @foo(ptr %ptr) {
 ; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [foo_param_0];
-; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0];
 ; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
 %val = tail call <4 x float> @bar()
 store <4 x float> %val, ptr %ptr
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index d702ede61add..cac49b49970b 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -8,11 +8,11 @@
 ; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
 ; PTX32: cvta.local.u32 %SP, %SPL;
 ; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
-; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
+; PTX32: st.volatile.u32 [%SP], %r{{[0-9]+}};
 ; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
 ; PTX64: cvta.local.u64 %SP, %SPL;
 ; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
-; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
+; PTX64: st.volatile.u32 [%SP], %r{{[0-9]+}};
 define void @foo(i32 %a) {
 %local = alloca i32, align 4
 store volatile i32 %a, ptr %local
diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
index 400184aaefb2..e09fb938ef08 100644
--- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -26,7 +26,7 @@ define void @alloca_in_explicit_local_as() {
 ; PTX-LABEL: .visible .func alloca_in_explicit_local_as(
 %A = alloca i32, addrspace(5)
 ; CHECK: store i32 0, ptr addrspace(5) {{%.+}}
-; PTX: st.local.u32 [%SP+0], {{%r[0-9]+}}
+; PTX: st.local.u32 [%SP], {{%r[0-9]+}}
 ; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr addrspace(5) %A to ptr
 ; LOWERALLOCAONLY: store i32 0, ptr [[V1]], align 4
 store i32 0, ptr addrspace(5) %A
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 33fa3afc94b8..9cfe9192772b 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -43,7 +43,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT: ld.param.u64 %rd4, [non_kernel_function_param_0+8];
 ; PTX-NEXT: st.u64 [%rd3], %rd4;
 ; PTX-NEXT: ld.param.u64 %rd5, [non_kernel_function_param_0];
-; PTX-NEXT: st.u64 [%SP+0], %rd5;
+; PTX-NEXT: st.u64 [%SP], %rd5;
 ; PTX-NEXT: mov.u64 %rd6, gi;
 ; PTX-NEXT: cvta.global.u64 %rd7, %rd6;
 ; PTX-NEXT: selp.b64 %rd8, %rd2, %rd7, %p1;
@@ -58,7 +58,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT: shl.b32 %r8, %r7, 24;
 ; PTX-NEXT: or.b32 %r9, %r8, %r6;
 ; PTX-NEXT: or.b32 %r10, %r9, %r4;
-; PTX-NEXT: st.param.b32 [func_retval0+0], %r10;
+; PTX-NEXT: st.param.b32 [func_retval0], %r10;
 ; PTX-NEXT: ret;
 entry:
 %a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr), !dbg !17
@@ -147,7 +147,7 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-NEXT: mov.u64 %rd1, escape;
 ; PTX-NEXT: { // callseq 0, 0
 ; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0+0], %rd4;
+; PTX-NEXT: st.param.b64 [param0], %rd4;
 ; PTX-NEXT: .param .b32 retval0;
 ; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
 ; PTX-NEXT: call (retval0),
@@ -156,7 +156,7 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-NEXT: param0
 ; PTX-NEXT: )
 ; PTX-NEXT: , prototype_0;
-; PTX-NEXT: ld.param.b32 %r1, [retval0+0];
+; PTX-NEXT: ld.param.b32 %r1, [retval0];
 ; PTX-NEXT: } // callseq 0
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define void @grid_const_escape(
@@ -194,11 +194,11 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; PTX-NEXT: mov.u64 %rd1, escape3;
 ; PTX-NEXT: { // callseq 1, 0
 ; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0+0], %rd7;
+; PTX-NEXT: st.param.b64 [param0], %rd7;
 ; PTX-NEXT: .param .b64 param1;
-; PTX-NEXT: st.param.b64 [param1+0], %rd8;
+; PTX-NEXT: st.param.b64 [param1], %rd8;
 ; PTX-NEXT: .param .b64 param2;
-; PTX-NEXT: st.param.b64 [param2+0], %rd5;
+; PTX-NEXT: st.param.b64 [param2], %rd5;
 ; PTX-NEXT: .param .b32 retval0;
 ; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
 ; PTX-NEXT: call (retval0),
@@ -209,7 +209,7 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; PTX-NEXT: param2
 ; PTX-NEXT: )
 ; PTX-NEXT: , prototype_1;
-; PTX-NEXT: ld.param.b32 %r2, [retval0+0];
+; PTX-NEXT: ld.param.b32 %r2, [retval0];
 ; PTX-NEXT: } // callseq 1
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define void @multiple_grid_const_escape(
@@ -307,7 +307,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-NEXT: mov.u64 %rd1, escape;
 ; PTX-NEXT: { // callseq 2, 0
 ; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0+0], %rd6;
+; PTX-NEXT: st.param.b64 [param0], %rd6;
 ; PTX-NEXT: .param .b32 retval0;
 ; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
 ; PTX-NEXT: call (retval0),
@@ -316,7 +316,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-NEXT: param0
 ; PTX-NEXT: )
 ; PTX-NEXT: , prototype_2;
-; PTX-NEXT: ld.param.b32 %r3, [retval0+0];
+; PTX-NEXT: ld.param.b32 %r3, [retval0];
 ; PTX-NEXT: } // callseq 2
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define void @grid_const_partial_escape(
@@ -356,7 +356,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; PTX-NEXT: mov.u64 %rd1, escape;
 ; PTX-NEXT: { // callseq 3, 0
 ; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0+0], %rd6;
+; PTX-NEXT: st.param.b64 [param0], %rd6;
 ; PTX-NEXT: .param .b32 retval0;
 ; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
 ; PTX-NEXT: call (retval0),
@@ -365,9 +365,9 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; PTX-NEXT: param0
 ; PTX-NEXT: )
 ; PTX-NEXT: , prototype_3;
-; PTX-NEXT: ld.param.b32 %r4, [retval0+0];
+; PTX-NEXT: ld.param.b32 %r4, [retval0];
 ; PTX-NEXT: } // callseq 3
-; PTX-NEXT: st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT: st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define i32 @grid_const_partial_escapemem(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
@@ -574,7 +574,7 @@ define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
 ; PTX-NEXT: cvt.u32.u64 %r2, %rd3;
 ; PTX-NEXT: add.s32 %r3, %r1, %r2;
-; PTX-NEXT: st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT: st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT: ret;
 ; OPT-LABEL: define i32 @grid_const_ptrtoint(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index d1bec032ec3a..eba4f273fa70 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -46,18 +46,18 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
 ; PTX-NEXT: mov.u64 %SPL, __local_depot1;
 ; PTX-NEXT: cvta.local.u64 %SP, %SPL;
 ; PTX-NEXT: ld.param.u64 %rd1, [load_padding_param_0];
-; PTX-NEXT: st.u64 [%SP+0], %rd1;
+; PTX-NEXT: st.u64 [%SP], %rd1;
 ; PTX-NEXT: add.u64 %rd2, %SP, 0;
 ; PTX-NEXT: { // callseq 1, 0
 ; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0+0], %rd2;
+; PTX-NEXT: st.param.b64 [param0], %rd2;
 ; PTX-NEXT: .param .b64 retval0;
 ; PTX-NEXT: call.uni (retval0),
 ; PTX-NEXT: escape,
 ; PTX-NEXT: (
 ; PTX-NEXT: param0
 ; PTX-NEXT: );
-; PTX-NEXT: ld.param.b64 %rd3, [retval0+0];
+; PTX-NEXT: ld.param.b64 %rd3, [retval0];
 ; PTX-NEXT: } // callseq 1
 ; PTX-NEXT: ret;
 %tmp = call ptr @escape(ptr nonnull align 16 %arg)
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index bdd6c9143846..5161e5d02977 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -55,7 +55,7 @@ define float @ceil_float(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [ceil_float_param_0];
 ; CHECK-NEXT: cvt.rpi.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.ceil.f32(float %a)
 ret float %b
@@ -69,7 +69,7 @@ define float @ceil_float_ftz(float %a) #1 {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [ceil_float_ftz_param_0];
 ; CHECK-NEXT: cvt.rpi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.ceil.f32(float %a)
 ret float %b
@@ -83,7 +83,7 @@ define double @ceil_double(double %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f64 %fd1, [ceil_double_param_0];
 ; CHECK-NEXT: cvt.rpi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
 ; CHECK-NEXT: ret;
 %b = call double @llvm.ceil.f64(double %a)
 ret double %b
@@ -99,7 +99,7 @@ define float @floor_float(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [floor_float_param_0];
 ; CHECK-NEXT: cvt.rmi.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.floor.f32(float %a)
 ret float %b
@@ -113,7 +113,7 @@ define float @floor_float_ftz(float %a) #1 {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [floor_float_ftz_param_0];
 ; CHECK-NEXT: cvt.rmi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.floor.f32(float %a)
 ret float %b
@@ -127,7 +127,7 @@ define double @floor_double(double %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f64 %fd1, [floor_double_param_0];
 ; CHECK-NEXT: cvt.rmi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
 ; CHECK-NEXT: ret;
 %b = call double @llvm.floor.f64(double %a)
 ret double %b
@@ -157,7 +157,7 @@ define float @round_float(float %a) {
 ; CHECK-NEXT: cvt.rzi.f32.f32 %f7, %f1;
 ; CHECK-NEXT: setp.lt.f32 %p2, %f5, 0f3F000000;
 ; CHECK-NEXT: selp.f32 %f8, %f7, %f6, %p2;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f8;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f8;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.round.f32(float %a)
 ret float %b
@@ -185,7 +185,7 @@ define float @round_float_ftz(float %a) #1 {
 ; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f7, %f1;
 ; CHECK-NEXT: setp.lt.ftz.f32 %p2, %f5, 0f3F000000;
 ; CHECK-NEXT: selp.f32 %f8, %f7, %f6, %p2;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f8;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f8;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.round.f32(float %a)
 ret float %b
@@ -208,7 +208,7 @@ define double @round_double(double %a) {
 ; CHECK-NEXT: copysign.f64 %fd6, %fd1, %fd5;
 ; CHECK-NEXT: setp.gt.f64 %p2, %fd2, 0d4330000000000000;
 ; CHECK-NEXT: selp.f64 %fd7, %fd1, %fd6, %p2;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd7;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd7;
 ; CHECK-NEXT: ret;
 %b = call double @llvm.round.f64(double %a)
 ret double %b
@@ -224,7 +224,7 @@ define float @nearbyint_float(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [nearbyint_float_param_0];
 ; CHECK-NEXT: cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.nearbyint.f32(float %a)
 ret float %b
@@ -238,7 +238,7 @@ define float @nearbyint_float_ftz(float %a) #1 {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [nearbyint_float_ftz_param_0];
 ; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.nearbyint.f32(float %a)
 ret float %b
@@ -252,7 +252,7 @@ define double @nearbyint_double(double %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f64 %fd1, [nearbyint_double_param_0];
 ; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
 ; CHECK-NEXT: ret;
 %b = call double @llvm.nearbyint.f64(double %a)
 ret double %b
@@ -268,7 +268,7 @@ define float @rint_float(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [rint_float_param_0];
 ; CHECK-NEXT: cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.rint.f32(float %a)
 ret float %b
@@ -282,7 +282,7 @@ define float @rint_float_ftz(float %a) #1 {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [rint_float_ftz_param_0];
 ; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.rint.f32(float %a)
 ret float %b
@@ -296,7 +296,7 @@ define double @rint_double(double %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f64 %fd1, [rint_double_param_0];
 ; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
 ; CHECK-NEXT: ret;
 %b = call double @llvm.rint.f64(double %a)
 ret double %b
@@ -312,7 +312,7 @@ define float @roundeven_float(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [roundeven_float_param_0];
 ; CHECK-NEXT: cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.roundeven.f32(float %a)
 ret float %b
@@ -326,7 +326,7 @@ define float @roundeven_float_ftz(float %a) #1 {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [roundeven_float_ftz_param_0];
 ; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.roundeven.f32(float %a)
 ret float %b
@@ -340,7 +340,7 @@ define double @roundeven_double(double %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f64 %fd1, [roundeven_double_param_0];
 ; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
 ; CHECK-NEXT: ret;
 %b = call double @llvm.roundeven.f64(double %a)
 ret double %b
@@ -356,7 +356,7 @@ define float @trunc_float(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [trunc_float_param_0];
 ; CHECK-NEXT: cvt.rzi.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.trunc.f32(float %a)
 ret float %b
@@ -370,7 +370,7 @@ define float @trunc_float_ftz(float %a) #1 {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [trunc_float_ftz_param_0];
 ; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.trunc.f32(float %a)
 ret float %b
@@ -384,7 +384,7 @@ define double @trunc_double(double %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f64 %fd1, [trunc_double_param_0];
 ; CHECK-NEXT: cvt.rzi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
 ; CHECK-NEXT: ret;
 %b = call double @llvm.trunc.f64(double %a)
 ret double %b
@@ -400,7 +400,7 @@ define float @abs_float(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [abs_float_param_0];
 ; CHECK-NEXT: abs.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.fabs.f32(float %a)
 ret float %b
@@ -414,7 +414,7 @@ define float @abs_float_ftz(float %a) #1 {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [abs_float_ftz_param_0];
 ; CHECK-NEXT: abs.ftz.f32 %f2, %f1;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %b = call float @llvm.fabs.f32(float %a)
 ret float %b
@@ -428,7 +428,7 @@ define double @abs_double(double %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f64 %fd1, [abs_double_param_0];
 ; CHECK-NEXT: abs.f64 %fd2, %fd1;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
 ; CHECK-NEXT: ret;
 %b = call double @llvm.fabs.f64(double %a)
 ret double %b
@@ -449,7 +449,7 @@ define half @minnum_half(half %a, half %b) {
 ; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
 ; CHECK-NOF16-NEXT: min.f32 %f3, %f2, %f1;
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3;
-; CHECK-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: minnum_half(
@@ -460,7 +460,7 @@ define half @minnum_half(half %a, half %b) {
 ; CHECK-F16-NEXT: ld.param.b16 %rs1, [minnum_half_param_0];
 ; CHECK-F16-NEXT: ld.param.b16 %rs2, [minnum_half_param_1];
 ; CHECK-F16-NEXT: min.f16 %rs3, %rs1, %rs2;
-; CHECK-F16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minnum_half(
@@ -475,7 +475,7 @@ define half @minnum_half(half %a, half %b) {
 ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
 ; CHECK-SM80-NOF16-NEXT: min.f32 %f3, %f2, %f1;
 ; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3;
-; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call half @llvm.minnum.f16(half %a, half %b)
 ret half %x
@@ -490,7 +490,7 @@ define float @minnum_float(float %a, float %b) {
 ; CHECK-NEXT: ld.param.f32 %f1, [minnum_float_param_0];
 ; CHECK-NEXT: ld.param.f32 %f2, [minnum_float_param_1];
 ; CHECK-NEXT: min.f32 %f3, %f1, %f2;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.minnum.f32(float %a, float %b)
 ret float %x
@@ -504,7 +504,7 @@ define float @minnum_imm1(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [minnum_imm1_param_0];
 ; CHECK-NEXT: min.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.minnum.f32(float %a, float 0.0)
 ret float %x
@@ -518,7 +518,7 @@ define float @minnum_imm2(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [minnum_imm2_param_0];
 ; CHECK-NEXT: min.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.minnum.f32(float 0.0, float %a)
 ret float %x
@@ -533,7 +533,7 @@ define float @minnum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NEXT: ld.param.f32 %f1, [minnum_float_ftz_param_0];
 ; CHECK-NEXT: ld.param.f32 %f2, [minnum_float_ftz_param_1];
 ; CHECK-NEXT: min.ftz.f32 %f3, %f1, %f2;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.minnum.f32(float %a, float %b)
 ret float %x
@@ -548,7 +548,7 @@ define double @minnum_double(double %a, double %b) {
 ; CHECK-NEXT: ld.param.f64 %fd1, [minnum_double_param_0];
 ; CHECK-NEXT: ld.param.f64 %fd2, [minnum_double_param_1];
 ; CHECK-NEXT: min.f64 %fd3, %fd1, %fd2;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd3;
 ; CHECK-NEXT: ret;
 %x = call double @llvm.minnum.f64(double %a, double %b)
 ret double %x
@@ -575,7 +575,7 @@ define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NOF16-NEXT: min.f32 %f6, %f5, %f4;
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6;
 ; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: minnum_v2half(
@@ -586,7 +586,7 @@ define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-F16-NEXT: ld.param.b32 %r1, [minnum_v2half_param_1];
 ; CHECK-F16-NEXT: ld.param.b32 %r2, [minnum_v2half_param_0];
 ; CHECK-F16-NEXT: min.f16x2 %r3, %r2, %r1;
-; CHECK-F16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minnum_v2half(
@@ -609,7 +609,7 @@ define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-SM80-NOF16-NEXT: min.f32 %f6, %f5, %f4;
 ; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6;
 ; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5};
-; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
 ret <2 x half> %x
@@ -640,7 +640,7 @@ define half @minimum_half(half %a, half %b) {
 ; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs5;
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5;
-; CHECK-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs9;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: minimum_half(
@@ -651,7 +651,7 @@ define half @minimum_half(half %a, half %b) {
 ; CHECK-F16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0];
 ; CHECK-F16-NEXT: ld.param.b16 %rs2, [minimum_half_param_1];
 ; CHECK-F16-NEXT: min.NaN.f16 %rs3, %rs1, %rs2;
-; CHECK-F16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_half(
@@ -676,7 +676,7 @@ define half @minimum_half(half %a, half %b) {
 ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs5;
 ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5;
-; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs9;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call half @llvm.minimum.f16(half %a, half %b)
 ret half %x
@@ -703,7 +703,7 @@ define float @minimum_float(float %a, float %b) {
 ; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3;
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %f4, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f7;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: minimum_float(
@@ -714,7 +714,7 @@ define float @minimum_float(float %a, float %b) {
 ; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_float_param_0];
 ; CHECK-F16-NEXT: ld.param.f32 %f2, [minimum_float_param_1];
 ; CHECK-F16-NEXT: min.NaN.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_float(
@@ -725,7 +725,7 @@ define float @minimum_float(float %a, float %b) {
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_param_0];
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_param_1];
 ; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call float @llvm.minimum.f32(float %a, float %b)
 ret float %x
@@ -748,7 +748,7 @@ define float @minimum_imm1(float %a) {
 ; CHECK-NOF16-NEXT: selp.f32 %f4, %f1, %f3, %p2;
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.f32 %f5, %f4, %f3, %p3;
-; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f5;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f5;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: minimum_imm1(
@@ -758,7 +758,7 @@ define float @minimum_imm1(float %a) {
 ; CHECK-F16-NEXT: // %bb.0:
 ; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_imm1_param_0];
 ; CHECK-F16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_imm1(
@@ -768,7 +768,7 @@ define float @minimum_imm1(float %a) {
 ; CHECK-SM80-NOF16-NEXT: // %bb.0:
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm1_param_0];
 ; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call float @llvm.minimum.f32(float %a, float 0.0)
 ret float %x
@@ -791,7 +791,7 @@ define float @minimum_imm2(float %a) {
 ; CHECK-NOF16-NEXT: selp.f32 %f4, %f1, %f3, %p2;
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.f32 %f5, %f4, %f3, %p3;
-; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f5;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f5;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: minimum_imm2(
@@ -801,7 +801,7 @@ define float @minimum_imm2(float %a) {
 ; CHECK-F16-NEXT: // %bb.0:
 ; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_imm2_param_0];
 ; CHECK-F16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_imm2(
@@ -811,7 +811,7 @@ define float @minimum_imm2(float %a) {
 ; CHECK-SM80-NOF16-NEXT: // %bb.0:
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm2_param_0];
 ; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call float @llvm.minimum.f32(float 0.0, float %a)
 ret float %x
@@ -838,7 +838,7 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3;
 ; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %f4, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f7;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: minimum_float_ftz(
@@ -849,7 +849,7 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
 ; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_float_ftz_param_0];
 ; CHECK-F16-NEXT: ld.param.f32 %f2, [minimum_float_ftz_param_1];
 ; CHECK-F16-NEXT: min.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_float_ftz(
@@ -860,7 +860,7 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_ftz_param_0];
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_ftz_param_1];
 ; CHECK-SM80-NOF16-NEXT: min.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call float @llvm.minimum.f32(float %a, float %b)
 ret float %x
@@ -887,7 +887,7 @@ define double @minimum_double(double %a, double %b) {
 ; CHECK-NEXT: selp.f64 %fd6, %fd2, %fd5, %p3;
 ; CHECK-NEXT: setp.eq.f64 %p4, %fd4, 0d0000000000000000;
 ; CHECK-NEXT: selp.f64 %fd7, %fd6, %fd4, %p4;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd7;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd7;
 ; CHECK-NEXT: ret;
 %x = call double @llvm.minimum.f64(double %a, double %b)
 ret double %x
@@ -933,7 +933,7 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10;
 ; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11};
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: minimum_v2half(
@@ -944,7 +944,7 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-F16-NEXT: ld.param.b32 %r1, [minimum_v2half_param_1];
 ; CHECK-F16-NEXT: ld.param.b32 %r2, [minimum_v2half_param_0];
 ; CHECK-F16-NEXT: min.NaN.f16x2 %r3, %r2, %r1;
-; CHECK-F16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_v2half(
@@ -986,7 +986,7 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10;
 ; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11};
-; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
 ret <2 x half> %x
@@ -1007,7 +1007,7 @@ define half @maxnum_half(half %a, half %b) {
 ; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
 ; CHECK-NOF16-NEXT: max.f32 %f3, %f2, %f1;
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3;
-; CHECK-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: maxnum_half(
@@ -1018,7 +1018,7 @@ define half @maxnum_half(half %a, half %b) {
 ; CHECK-F16-NEXT: ld.param.b16 %rs1, [maxnum_half_param_0];
 ; CHECK-F16-NEXT: ld.param.b16 %rs2, [maxnum_half_param_1];
 ; CHECK-F16-NEXT: max.f16 %rs3, %rs1, %rs2;
-; CHECK-F16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maxnum_half(
@@ -1033,7 +1033,7 @@ define half @maxnum_half(half %a, half %b) {
 ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
 ; CHECK-SM80-NOF16-NEXT: max.f32 %f3, %f2, %f1;
 ; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3;
-; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call half @llvm.maxnum.f16(half %a, half %b)
 ret half %x
@@ -1047,7 +1047,7 @@ define float @maxnum_imm1(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [maxnum_imm1_param_0];
 ; CHECK-NEXT: max.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.maxnum.f32(float %a, float 0.0)
 ret float %x
@@ -1061,7 +1061,7 @@ define float @maxnum_imm2(float %a) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.f32 %f1, [maxnum_imm2_param_0];
 ; CHECK-NEXT: max.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.maxnum.f32(float 0.0, float %a)
 ret float %x
@@ -1076,7 +1076,7 @@ define float @maxnum_float(float %a, float %b) {
 ; CHECK-NEXT: ld.param.f32 %f1, [maxnum_float_param_0];
 ; CHECK-NEXT: ld.param.f32 %f2, [maxnum_float_param_1];
 ; CHECK-NEXT: max.f32 %f3, %f1, %f2;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.maxnum.f32(float %a, float %b)
 ret float %x
@@ -1091,7 +1091,7 @@ define float @maxnum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NEXT: ld.param.f32 %f1, [maxnum_float_ftz_param_0];
 ; CHECK-NEXT: ld.param.f32 %f2, [maxnum_float_ftz_param_1];
 ; CHECK-NEXT: max.ftz.f32 %f3, %f1, %f2;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.maxnum.f32(float %a, float %b)
 ret float %x
@@ -1106,7 +1106,7 @@ define double @maxnum_double(double %a, double %b) {
 ; CHECK-NEXT: ld.param.f64 %fd1, [maxnum_double_param_0];
 ; CHECK-NEXT: ld.param.f64 %fd2, [maxnum_double_param_1];
 ; CHECK-NEXT: max.f64 %fd3, %fd1, %fd2;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd3;
 ; CHECK-NEXT: ret;
 %x = call double @llvm.maxnum.f64(double %a, double %b)
 ret double %x
@@ -1133,7 +1133,7 @@ define <2 x half> @maxnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NOF16-NEXT: max.f32 %f6, %f5, %f4;
 ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6;
 ; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: maxnum_v2half(
@@ -1144,7 +1144,7 @@ define <2 x half> @maxnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-F16-NEXT: ld.param.b32 %r1, [maxnum_v2half_param_1];
 ; CHECK-F16-NEXT: ld.param.b32 %r2, [maxnum_v2half_param_0];
 ; CHECK-F16-NEXT: max.f16x2 %r3, %r2, %r1;
-; CHECK-F16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maxnum_v2half(
@@ -1167,7 +1167,7 @@ define <2 x half> @maxnum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-SM80-NOF16-NEXT: max.f32 %f6, %f5, %f4;
 ; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6;
 ; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5};
-; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
 ret <2 x half> %x
@@ -1198,7 +1198,7 @@ define half @maximum_half(half %a, half %b) {
 ; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs5;
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5;
-; CHECK-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs9;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: maximum_half(
@@ -1209,7 +1209,7 @@ define half @maximum_half(half %a, half %b) {
 ; CHECK-F16-NEXT: ld.param.b16 %rs1, [maximum_half_param_0];
 ; CHECK-F16-NEXT: ld.param.b16 %rs2, [maximum_half_param_1];
 ; CHECK-F16-NEXT: max.NaN.f16 %rs3, %rs1, %rs2;
-; CHECK-F16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_half(
@@ -1234,7 +1234,7 @@ define half @maximum_half(half %a, half %b) {
 ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs5;
 ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5;
-; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs9;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call half @llvm.maximum.f16(half %a, half %b)
 ret half %x
@@ -1253,7 +1253,7 @@ define float @maximum_imm1(float %a) {
 ; CHECK-NOF16-NEXT: selp.f32 %f3, 0f7FC00000, %f2, %p1;
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.f32 %f4, 0f00000000, %f3, %p2;
-; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f4;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f4;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: maximum_imm1(
@@ -1263,7 +1263,7 @@ define float @maximum_imm1(float %a) {
 ; CHECK-F16-NEXT: // %bb.0:
 ; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_imm1_param_0];
 ; CHECK-F16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_imm1(
@@ -1273,7 +1273,7 @@ define float @maximum_imm1(float %a) {
 ; CHECK-SM80-NOF16-NEXT: // %bb.0:
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm1_param_0];
 ; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call float @llvm.maximum.f32(float %a, float 0.0)
 ret float %x
@@ -1292,7 +1292,7 @@ define float @maximum_imm2(float %a) {
 ; CHECK-NOF16-NEXT: selp.f32 %f3, 0f7FC00000, %f2, %p1;
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.f32 %f4, 0f00000000, %f3, %p2;
-; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f4;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f4;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: maximum_imm2(
@@ -1302,7 +1302,7 @@ define float @maximum_imm2(float %a) {
 ; CHECK-F16-NEXT: // %bb.0:
 ; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_imm2_param_0];
 ; CHECK-F16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_imm2(
@@ -1312,7 +1312,7 @@ define float @maximum_imm2(float %a) {
 ; CHECK-SM80-NOF16-NEXT: // %bb.0:
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm2_param_0];
 ; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f2;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call float @llvm.maximum.f32(float 0.0, float %a)
 ret float %x
@@ -1339,7 +1339,7 @@ define float @maximum_float(float %a, float %b) {
 ; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3;
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %f4, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f7;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: maximum_float(
@@ -1350,7 +1350,7 @@ define float @maximum_float(float %a, float %b) {
 ; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_float_param_0];
 ; CHECK-F16-NEXT: ld.param.f32 %f2, [maximum_float_param_1];
 ; CHECK-F16-NEXT: max.NaN.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_float(
@@ -1361,7 +1361,7 @@ define float @maximum_float(float %a, float %b) {
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_param_0];
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_param_1];
 ; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call float @llvm.maximum.f32(float %a, float %b)
 ret float %x
@@ -1388,7 +1388,7 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3;
 ; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %f4, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f7;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: maximum_float_ftz(
@@ -1399,7 +1399,7 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
 ; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_float_ftz_param_0];
 ; CHECK-F16-NEXT: ld.param.f32 %f2, [maximum_float_ftz_param_1];
 ; CHECK-F16-NEXT: max.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_float_ftz(
@@ -1410,7 +1410,7 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_ftz_param_0];
 ; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_ftz_param_1];
 ; CHECK-SM80-NOF16-NEXT: max.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call float @llvm.maximum.f32(float %a, float %b)
 ret float %x
@@ -1437,7 +1437,7 @@ define double @maximum_double(double %a, double %b) {
 ; CHECK-NEXT: selp.f64 %fd6, %fd2, %fd5, %p3;
 ; CHECK-NEXT: setp.eq.f64 %p4, %fd4, 0d0000000000000000;
 ; CHECK-NEXT: selp.f64 %fd7, %fd6, %fd4, %p4;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd7;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd7;
 ; CHECK-NEXT: ret;
 %x = call double @llvm.maximum.f64(double %a, double %b)
 ret double %x
@@ -1483,7 +1483,7 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
 ; CHECK-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10;
 ; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11};
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-NOF16-NEXT: ret;
 ;
 ; CHECK-F16-LABEL: maximum_v2half(
@@ -1494,7 +1494,7 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-F16-NEXT: ld.param.b32 %r1, [maximum_v2half_param_1];
 ; CHECK-F16-NEXT: ld.param.b32 %r2, [maximum_v2half_param_0];
 ; CHECK-F16-NEXT: max.NaN.f16x2 %r3, %r2, %r1;
-; CHECK-F16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-F16-NEXT: ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_v2half(
@@ -1536,7 +1536,7 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
 ; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
 ; CHECK-SM80-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10;
 ; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11};
-; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-SM80-NOF16-NEXT: ret;
 %x = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
 ret <2 x half> %x
@@ -1554,7 +1554,7 @@ define float @fma_float(float %a, float %b, float %c) {
 ; CHECK-NEXT: ld.param.f32 %f2, [fma_float_param_1];
 ; CHECK-NEXT: ld.param.f32 %f3, [fma_float_param_2];
 ; CHECK-NEXT: fma.rn.f32 %f4, %f1, %f2, %f3;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f4;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f4;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.fma.f32(float %a, float %b, float %c)
 ret float %x
@@ -1570,7 +1570,7 @@ define float @fma_float_ftz(float %a, float %b, float %c) #1 {
 ; CHECK-NEXT: ld.param.f32 %f2, [fma_float_ftz_param_1];
 ; CHECK-NEXT: ld.param.f32 %f3, [fma_float_ftz_param_2];
 ; CHECK-NEXT: fma.rn.ftz.f32 %f4, %f1, %f2, %f3;
-; CHECK-NEXT: st.param.f32 [func_retval0+0], %f4;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f4;
 ; CHECK-NEXT: ret;
 %x = call float @llvm.fma.f32(float %a, float %b, float %c)
 ret float %x
@@ -1586,7 +1586,7 @@ define double @fma_double(double %a, double %b, double %c) {
 ; CHECK-NEXT: ld.param.f64 %fd2, [fma_double_param_1];
 ; CHECK-NEXT: ld.param.f64 %fd3, [fma_double_param_2];
 ; CHECK-NEXT: fma.rn.f64 %fd4, %fd1, %fd2, %fd3;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd4;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd4;
 ; CHECK-NEXT: ret;
 %x = call double @llvm.fma.f64(double %a, double %b, double %c)
 ret double %x
diff --git a/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll b/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll
index efa99462b9b1..21fce55fcbc2 100644
--- a/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll
@@ -13,7 +13,7 @@ define i16 @test_mulhi_i16(i16 %x, i16 %y) {
 ; CHECK-NEXT: ld.param.u16 %rs2, [test_mulhi_i16_param_1];
 ; CHECK-NEXT: mul.hi.s16 %rs3, %rs1, %rs2;
 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs3;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT: ret;
 %1 = call i16 @llvm.nvvm.mulhi.s(i16 %x, i16 %y)
 ret i16 %1
@@ -30,7 +30,7 @@ define i16 @test_mulhi_u16(i16 %x, i16 %y) {
 ; CHECK-NEXT: ld.param.u16 %rs2, [test_mulhi_u16_param_1];
 ; CHECK-NEXT: mul.hi.u16 %rs3, %rs1, %rs2;
 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs3;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT: ret;
 %1 = call i16 @llvm.nvvm.mulhi.us(i16 %x, i16 %y)
 ret i16 %1
@@ -45,7 +45,7 @@ define i32 @test_mulhi_i32(i32 %x, i32 %y) {
 ; CHECK-NEXT: ld.param.u32 %r1, [test_mulhi_i32_param_0];
 ; CHECK-NEXT: ld.param.u32 %r2, [test_mulhi_i32_param_1];
 ; CHECK-NEXT: mul.hi.s32 %r3, %r1, %r2;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT: ret;
 %1 = call i32 @llvm.nvvm.mulhi.i(i32 %x, i32 %y)
 ret i32 %1
@@ -60,7 +60,7 @@ define i32 @test_mulhi_u32(i32 %x, i32 %y) {
 ; CHECK-NEXT: ld.param.u32 %r1, [test_mulhi_u32_param_0];
 ; CHECK-NEXT: ld.param.u32 %r2, [test_mulhi_u32_param_1];
 ; CHECK-NEXT: mul.hi.u32 %r3, %r1, %r2;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT: ret;
 %1 = call i32 @llvm.nvvm.mulhi.ui(i32 %x, i32 %y)
 ret i32 %1
@@ -75,7 +75,7 @@ define i64 @test_mulhi_i64(i64 %x, i64 %y) {
 ; CHECK-NEXT: ld.param.u64 %rd1, [test_mulhi_i64_param_0];
 ; CHECK-NEXT: ld.param.u64 %rd2, [test_mulhi_i64_param_1];
 ; CHECK-NEXT: mul.hi.s64 %rd3, %rd1, %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT: ret;
 %1 = call i64 @llvm.nvvm.mulhi.ll(i64 %x, i64 %y)
 ret i64 %1
@@ -90,7 +90,7 @@ define i64 @test_mulhi_u64(i64 %x, i64 %y) {
 ; CHECK-NEXT: ld.param.u64 %rd1, [test_mulhi_u64_param_0];
 ; CHECK-NEXT: ld.param.u64 %rd2, [test_mulhi_u64_param_1];
 ; CHECK-NEXT: mul.hi.u64 %rd3, %rd1, %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT: ret;
 %1 = call i64 @llvm.nvvm.mulhi.ull(i64 %x, i64 %y)
 ret i64 %1
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
index 0088d6c64205..1e45df5efcf5 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
@@ -9,17 +9,17 @@ declare i32 @__nvvm_reflect(ptr)
 ; SM_52: .visible .func (.param .b32 func_retval0) foo()
 ; SM_52: mov.b32 %[[REG:.+]], 3;
-; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_52-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_52-NEXT: ret;
 ;
 ; SM_70: .visible .func (.param .b32 func_retval0) foo()
 ; SM_70: mov.b32 %[[REG:.+]], 2;
-; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_70-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_70-NEXT: ret;
 ;
 ; SM_90: .visible .func (.param .b32 func_retval0) foo()
 ; SM_90: mov.b32 %[[REG:.+]], 1;
-; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_90-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_90-NEXT: ret;
 define i32 @foo() {
 entry:
@@ -56,17 +56,17 @@ return:
 ; SM_52: .visible .func (.param .b32 func_retval0) bar()
 ; SM_52: mov.b32 %[[REG:.+]], 2;
-; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_52-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_52-NEXT: ret;
 ;
 ; SM_70: .visible .func (.param .b32 func_retval0) bar()
 ; SM_70: mov.b32 %[[REG:.+]], 1;
-; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_70-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_70-NEXT: ret;
 ;
 ; SM_90: .visible .func (.param .b32 func_retval0) bar()
 ; SM_90: mov.b32 %[[REG:.+]], 1;
-; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_90-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_90-NEXT: ret;
 define i32 @bar() {
 entry:
@@ -104,17 +104,17 @@ if.end:
 ; SM_52: .visible .func (.param .b32 func_retval0) qux()
 ; SM_52: mov.b32 %[[REG:.+]], 3;
-; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_52-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_52-NEXT: ret;
 ;
 ; SM_70: .visible .func (.param .b32 func_retval0) qux()
 ; SM_70: mov.b32 %[[REG:.+]], 2;
-; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_70-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_70-NEXT: ret;
 ;
 ; SM_90: .visible .func (.param .b32 func_retval0) qux()
 ; SM_90: mov.b32 %[[REG:.+]], 1;
-; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_90-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_90-NEXT: ret;
 define i32 @qux() {
 entry:
@@ -144,15 +144,15 @@ return:
 ; SM_52: .visible .func (.param .b32 func_retval0) phi()
 ; SM_52: mov.f32 %[[REG:.+]], 0f00000000;
-; SM_52-NEXT: st.param.f32 [func_retval0+0], %[[REG]];
+; SM_52-NEXT: st.param.f32 [func_retval0], %[[REG]];
 ; SM_52-NEXT: ret;
 ; SM_70: .visible .func (.param .b32 func_retval0) phi()
 ; SM_70: mov.f32 %[[REG:.+]], 0f00000000;
-; SM_70-NEXT: st.param.f32 [func_retval0+0], %[[REG]];
+; SM_70-NEXT: st.param.f32 [func_retval0], %[[REG]];
 ; SM_70-NEXT: ret;
 ; SM_90: .visible .func (.param .b32 func_retval0) phi()
 ; SM_90: mov.f32 %[[REG:.+]], 0f00000000;
-; SM_90-NEXT: st.param.f32 [func_retval0+0], %[[REG]];
+; SM_90-NEXT: st.param.f32 [func_retval0], %[[REG]];
 ; SM_90-NEXT: ret;
 define float @phi() {
 entry:
@@ -177,17 +177,17 @@ exit:
 ; SM_52: .visible .func (.param .b32 func_retval0) prop()
 ; SM_52: mov.b32 %[[REG:.+]], 3;
-; SM_52-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_52-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_52-NEXT: ret;
 ;
 ; SM_70: .visible .func (.param .b32 func_retval0) prop()
 ; SM_70: mov.b32 %[[REG:.+]], 2;
-; SM_70-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_70-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_70-NEXT: ret;
 ;
 ; SM_90: .visible .func (.param .b32 func_retval0) prop()
 ; SM_90: mov.b32 %[[REG:.+]], 1;
-; SM_90-NEXT: st.param.b32 [func_retval0+0], %[[REG:.+]];
+; SM_90-NEXT: st.param.b32 [func_retval0], %[[REG:.+]];
 ; SM_90-NEXT: ret;
 define i32 @prop() {
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index a29d4e1875cd..bb95f88e999d 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -30,13 +30,13 @@
 ; CHECK: cvt.u32.u16 [[B:%r[0-9]+]], [[A8]]
 ; CHECK: and.b32 [[C:%r[0-9]+]], [[B]], 1;
 ; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[C]]
+; CHECK: st.param.b32 [param0], [[C]]
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni
 ; CHECK-NEXT: test_i1,
-; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
+; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0];
 ; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK: ret;
 define i1 @test_i1(i1 %a) {
 %r = tail call i1 @test_i1(i1 %a);
@@ -53,13 +53,13 @@ define i1 @test_i1(i1 %a) {
 ; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1;
 ; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]];
 ; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: st.param.b32 [param0], [[A]];
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni
-; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
+; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0];
 ; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1;
 ; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define signext i1 @test_i1s(i1 signext %a) {
 %r = tail call signext i1 @test_i1s(i1 signext %a);
@@ -73,14 +73,14 @@ define signext i1 @test_i1s(i1 signext %a) {
 ; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
 ; CHECK-DAG: ld.param.u8 [[E0:%rs[0-9]+]], [test_v3i1_param_0]
 ; CHECK: .param .align 1 .b8 param0[1];
-; CHECK-DAG: st.param.b8 [param0+0], [[E0]];
+; CHECK-DAG: st.param.b8 [param0], [[E0]];
 ; CHECK-DAG: st.param.b8 [param0+2], [[E2]];
 ; CHECK: .param .align 1 .b8 retval0[1];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v3i1,
-; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
-; CHECK-DAG: st.param.b8 [func_retval0+0], [[RE0]]
+; CHECK-DAG: st.param.b8 [func_retval0], [[RE0]]
 ; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
 ; CHECK-NEXT: ret;
 define <3 x i1> @test_v3i1(<3 x i1> %a) {
@@ -93,15 +93,15 @@ define <3 x i1> @test_v3i1(<3 x i1> %a) {
 ; CHECK-NEXT: .param .align 1 .b8 test_v4i1_param_0[1]
 ; CHECK: ld.param.u8 [[E0:%rs[0-9]+]], [test_v4i1_param_0]
 ; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0+0], [[E0]];
+; CHECK: st.param.b8 [param0], [[E0]];
 ; CHECK: .param .align 1 .b8 retval0[1];
 ; CHECK: call.uni (retval0),
 ; CHECK: test_v4i1,
-; CHECK: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0+0];
+; CHECK: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
 ; CHECK: ld.param.b8 [[RE1:%rs[0-9]+]], [retval0+1];
 ; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
 ; CHECK: ld.param.b8 [[RE3:%rs[0-9]+]], [retval0+3];
-; CHECK: st.param.b8 [func_retval0+0], [[RE0]];
+; CHECK: st.param.b8 [func_retval0], [[RE0]];
 ; CHECK: st.param.b8 [func_retval0+1], [[RE1]];
 ; CHECK: st.param.b8 [func_retval0+2], [[RE2]];
 ; CHECK: st.param.b8 [func_retval0+3], [[RE3]];
@@ -117,14 +117,14 @@ define <4 x i1> @test_v4i1(<4 x i1> %a) {
 ; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
 ; CHECK-DAG: ld.param.u8 [[E0:%rs[0-9]+]], [test_v5i1_param_0]
 ; CHECK: .param .align 1 .b8 param0[1];
-; CHECK-DAG: st.param.b8 [param0+0], [[E0]];
+; CHECK-DAG: st.param.b8 [param0], [[E0]];
 ; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
 ; CHECK: .param .align 1 .b8 retval0[1];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v5i1,
-; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.b8 [func_retval0+0], [[RE0]]
+; CHECK-DAG: st.param.b8 [func_retval0], [[RE0]]
 ; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
 ; CHECK-NEXT: ret;
 define <5 x i1> @test_v5i1(<5 x i1> %a) {
@@ -137,12 +137,12 @@ define <5 x i1> @test_v5i1(<5 x i1> %a) {
 ; CHECK-NEXT: .param .b32 test_i2_param_0
 ; CHECK: ld.param.u8 {{%rs[0-9]+}}, [test_i2_param_0];
 ; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}};
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni (retval0),
 ; CHECK: test_i2,
-; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0];
-; CHECK: st.param.b32 [func_retval0+0], {{%r[0-9]+}};
+; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
+; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
 ; CHECK-NEXT: ret;
 define i2 @test_i2(i2 %a) {
 %r = tail call i2 @test_i2(i2 %a);
@@ -154,12 +154,12 @@ define i2 @test_i2(i2 %a) {
 ; CHECK-NEXT: .param .b32 test_i3_param_0
 ; CHECK: ld.param.u8 {{%rs[0-9]+}}, [test_i3_param_0];
 ; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}};
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni (retval0),
 ; CHECK: test_i3,
-; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0];
-; CHECK: st.param.b32 [func_retval0+0], {{%r[0-9]+}};
+; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
+; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
 ; CHECK-NEXT: ret;
 define i3 @test_i3(i3 %a) {
 %r = tail call i3 @test_i3(i3 %a);
@@ -174,13 +174,13 @@ define i3 @test_i3(i3 %a) {
 ; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
 ; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255;
 ; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: st.param.b32 [param0], [[A]];
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni (retval0),
 ; CHECK: test_i8,
-; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0];
 ; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i8 @test_i8(i8 %a) {
 %r = tail call i8 @test_i8(i8 %a);
@@ -194,15 +194,15 @@ define i8 @test_i8(i8 %a) {
 ; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
 ; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
 ; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: st.param.b32 [param0], [[A]];
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni (retval0),
 ; CHECK: test_i8s,
-; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0];
 ; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
 ; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]];
 ; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define signext i8 @test_i8s(i8 signext %a) {
 %r = tail call signext i8 @test_i8s(i8 signext %a);
@@ -214,14 +214,14 @@ define signext i8 @test_i8s(i8 signext %a) {
 ; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
 ; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v3i8_param_0];
 ; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0+0], [[R]]
+; CHECK: st.param.b32 [param0], [[R]]
 ; CHECK: .param .align 4 .b8 retval0[4];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v3i8,
-; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0+0];
+; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0];
 ; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very
 ; interesting here, so it's skipped.
-; CHECK: st.param.b32 [func_retval0+0],
+; CHECK: st.param.b32 [func_retval0],
 ; CHECK-NEXT: ret;
 define <3 x i8> @test_v3i8(<3 x i8> %a) {
 %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
@@ -233,12 +233,12 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) {
 ; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
 ; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0]
 ; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0+0], [[R]];
+; CHECK: st.param.b32 [param0], [[R]];
 ; CHECK: .param .align 4 .b8 retval0[4];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v4i8,
-; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0+0];
-; CHECK: st.param.b32 [func_retval0+0], [[RET]];
+; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0];
+; CHECK: st.param.b32 [func_retval0], [[RET]];
 ; CHECK-NEXT: ret;
 define <4 x i8> @test_v4i8(<4 x i8> %a) {
 %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
@@ -251,14 +251,14 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) {
 ; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_v5i8_param_0]
 ; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
 ; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0+0],
+; CHECK-DAG: st.param.v4.b8 [param0],
 ; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
 ; CHECK: .param .align 8 .b8 retval0[8];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v5i8,
-; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0];
 ; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.v4.b8 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
 ; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
 ; CHECK-NEXT: ret;
 define <5 x i8> @test_v5i8(<5 x i8> %a) {
@@ -270,12 +270,12 @@ define <5 x i8> @test_v5i8(<5 x i8> %a) {
 ; CHECK-LABEL: test_i11(
 ; CHECK-NEXT: .param .b32 test_i11_param_0
 ; CHECK: ld.param.u16 {{%rs[0-9]+}}, [test_i11_param_0];
-; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}};
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_i11,
-; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0];
-; CHECK: st.param.b32 [func_retval0+0], {{%r[0-9]+}};
+; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
+; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
 ; CHECK-NEXT: ret;
 define i11 @test_i11(i11 %a) {
 %r = tail call i11 @test_i11(i11 %a);
@@ -288,13 +288,13 @@ define i11 @test_i11(i11 %a) {
 ; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
 ; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
 ; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: st.param.b32 [param0], [[E32]];
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_i16,
-; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0];
 ; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define i16 @test_i16(i16 %a) {
 %r = tail call i16 @test_i16(i16 %a);
@@ -307,13 +307,13 @@ define i16 @test_i16(i16 %a) {
 ; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
 ; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
 ; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: st.param.b32 [param0], [[E32]];
 ; CHECK: .param .b32 retval0;
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_i16s,
-; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0];
 ; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: st.param.b32 [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define signext i16 @test_i16s(i16 signext %a) {
 %r = tail call signext i16 @test_i16s(i16 signext %a);
@@ -327,14 +327,14 @@ define signext i16 @test_i16s(i16 signext %a) {
 ; CHECK-DAG: ld.param.u32 [[R:%r[0-9]+]], [test_v3i16_param_0];
 ; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[R]];
 ; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b16 [param0], {[[E0]], [[E1]]};
 ; CHECK: st.param.b16 [param0+4], [[E2]];
 ; CHECK: .param .align 8 .b8 retval0[8];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v3i16,
-; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0];
 ; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[RE0]], [[RE1]]};
 ; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
 ; CHECK-NEXT: ret;
 define <3 x i16> @test_v3i16(<3 x i16> %a) {
@@ -347,12 +347,12 @@ define <3 x i16> @test_v3i16(<3 x i16> %a) {
 ; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
 ; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0]
 ; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
 ; CHECK: .param .align 8 .b8 retval0[8];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v4i16,
-; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
+; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}
 ; CHECK-NEXT: ret;
 define <4 x i16> @test_v4i16(<4 x i16> %a) {
 %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
@@ -365,14 +365,14 @@ define <4 x i16> @test_v4i16(<4 x i16> %a) {
 ; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
 ; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
 ; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.v4.b16 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
 ; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
 ; CHECK: .param .align 16 .b8 retval0[16];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v5i16,
-; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0];
 ; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
-; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
 ; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
 ; CHECK-NEXT: ret;
 define <5 x i16> @test_v5i16(<5 x i16> %a) {
@@ -385,12 +385,12 @@ define <5 x i16> @test_v5i16(<5 x i16> %a) {
 ; CHECK-NEXT: .param .align 2 .b8 test_f16_param_0[2]
 ; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_f16_param_0];
 ; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0+0], [[E]];
+; CHECK: st.param.b16 [param0], [[E]];
 ; CHECK: .param .align 2 .b8 retval0[2];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_f16,
-; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
-; CHECK: st.param.b16 [func_retval0+0], [[R]]
+; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
+; CHECK: st.param.b16 [func_retval0], [[R]]
 ; CHECK-NEXT: ret;
 define half @test_f16(half %a) {
 %r = tail call half @test_f16(half %a);
@@ -402,12 +402,12 @@ define half @test_f16(half %a) {
 ; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
 ; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2f16_param_0];
 ; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: st.param.b32 [param0], [[E]];
 ; CHECK: .param .align 4 .b8 retval0[4];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v2f16,
-; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
-; CHECK: st.param.b32 [func_retval0+0], [[R]]
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
+; CHECK: st.param.b32 [func_retval0], [[R]]
 ; CHECK-NEXT: ret;
 define <2 x half> @test_v2f16(<2 x half> %a) {
 %r = tail call <2 x half> @test_v2f16(<2 x half> %a);
@@ -419,12 +419,12 @@ define <2 x half> @test_v2f16(<2 x half> %a) {
 ; CHECK-NEXT: .param .align 2 .b8 test_bf16_param_0[2]
 ; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_bf16_param_0];
 ; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0+0], [[E]];
+; CHECK: st.param.b16 [param0], [[E]];
 ; CHECK: .param .align 2 .b8 retval0[2];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_bf16,
-; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
-; CHECK: st.param.b16 [func_retval0+0], [[R]]
+; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
+; CHECK: st.param.b16 [func_retval0], [[R]]
 ; CHECK-NEXT: ret;
 define bfloat @test_bf16(bfloat %a) {
 %r = tail call bfloat @test_bf16(bfloat %a);
@@ -436,12 +436,12 @@ define bfloat @test_bf16(bfloat %a) {
 ; CHECK-NEXT: .param .align 4 .b8 test_v2bf16_param_0[4]
 ; CHECK: ld.param.b32
[[E:%r[0-9]+]], [test_v2bf16_param_0]; ; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v2bf16, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]] +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; +; CHECK: st.param.b32 [func_retval0], [[R]] ; CHECK-NEXT: ret; define <2 x bfloat> @test_v2bf16(<2 x bfloat> %a) { %r = tail call <2 x bfloat> @test_v2bf16(<2 x bfloat> %a); @@ -456,14 +456,14 @@ define <2 x bfloat> @test_v2bf16(<2 x bfloat> %a) { ; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[HH01]]; ; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3f16_param_0+4]; ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.v2.b16 [param0], {[[E0]], [[E1]]}; ; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; ; CHECK: .param .align 8 .b8 retval0[8]; ; CHECK: call.uni (retval0), ; CHECK: test_v3f16, -; CHECK-DAG: ld.param.v2.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.v2.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b16 [[R2:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[R0]], [[R1]]}; ; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]]; ; CHECK: ret; define <3 x half> @test_v3f16(<3 x half> %a) { @@ -476,12 +476,12 @@ define <3 x half> @test_v3f16(<3 x half> %a) { ; CHECK: .param .align 8 .b8 test_v4f16_param_0[8] ; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b32 [param0+0], {[[R01]], [[R23]]}; +; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]}; ; CHECK: .param .align 8 .b8 retval0[8]; ; CHECK: call.uni (retval0), ; CHECK: test_v4f16, -; CHECK: ld.param.v2.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]}; +; CHECK: ld.param.v2.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]]}, [retval0]; +; CHECK: st.param.v2.b32 [func_retval0], {[[RH01]], [[RH23]]}; ; CHECK: ret; define <4 x half> @test_v4f16(<4 x half> %a) { %r = tail call <4 x half> @test_v4f16(<4 x half> %a); @@ -494,14 +494,14 @@ define <4 x half> @test_v4f16(<4 x half> %a) { ; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5f16_param_0]; ; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5f16_param_0+8]; ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0+0], +; CHECK-DAG: st.param.v4.b16 [param0], ; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; ; CHECK: .param .align 16 .b8 retval0[16]; ; CHECK: call.uni (retval0), ; CHECK: test_v5f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b16 [[R4:%rs[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; +; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]}; ; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; ; CHECK: ret; define <5 x half> @test_v5f16(<5 x half> %a) { @@ -514,12 +514,12 @@ define <5 x half> @test_v5f16(<5 x half> %a) 
{ ; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] ; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0+0], {[[R01]], [[R23]], [[R45]], [[R67]]}; +; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]}; ; CHECK: .param .align 16 .b8 retval0[16]; ; CHECK: call.uni (retval0), ; CHECK: test_v8f16, -; CHECK: ld.param.v4.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]], [[RH45:%r[0-9]+]], [[RH67:%r[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; +; CHECK: ld.param.v4.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]], [[RH45:%r[0-9]+]], [[RH67:%r[0-9]+]]}, [retval0]; +; CHECK: st.param.v4.b32 [func_retval0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; ; CHECK: ret; define <8 x half> @test_v8f16(<8 x half> %a) { %r = tail call <8 x half> @test_v8f16(<8 x half> %a); @@ -533,16 +533,16 @@ define <8 x half> @test_v8f16(<8 x half> %a) { ; CHECK-DAG: ld.param.v4.b16 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [test_v9f16_param_0+8]; ; CHECK-DAG: ld.param.b16 [[E8:%rs[0-9]+]], [test_v9f16_param_0+16]; ; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b16 [param0+0], +; CHECK-DAG: st.param.v4.b16 [param0], ; CHECK-DAG: st.param.v4.b16 [param0+8], ; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; ; CHECK: .param .align 32 .b8 retval0[32]; ; CHECK: call.uni (retval0), ; CHECK: test_v9f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.v4.b16 {[[R4:%rs[0-9]+]], [[R5:%rs[0-9]+]], [[R6:%rs[0-9]+]], [[R7:%rs[0-9]+]]}, [retval0+8]; ; CHECK-DAG: ld.param.b16 [[R8:%rs[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; +; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]}; ; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; ; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; ; CHECK: ret; @@ -557,12 +557,12 @@ define <9 x half> @test_v9f16(<9 x half> %a) { ; CHECK-DAG: ld.param.u16 {{%r[0-9]+}}, [test_i19_param_0]; ; CHECK-DAG: ld.param.u8 {{%r[0-9]+}}, [test_i19_param_0+2]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}}; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i19, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], {{%r[0-9]+}}; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; +; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; define i19 @test_i19(i19 %a) { %r = tail call i19 @test_i19(i19 %a); @@ -575,12 +575,12 @@ define i19 @test_i19(i19 %a) { ; CHECK-DAG: ld.param.u16 {{%r[0-9]+}}, [test_i23_param_0]; ; CHECK-DAG: ld.param.u8 {{%r[0-9]+}}, [test_i23_param_0+2]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}}; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i23, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], {{%r[0-9]+}}; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; +; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; 
CHECK-NEXT: ret; define i23 @test_i23(i23 %a) { %r = tail call i23 @test_i23(i23 %a); @@ -593,12 +593,12 @@ define i23 @test_i23(i23 %a) { ; CHECK-DAG: ld.param.u8 {{%r[0-9]+}}, [test_i24_param_0+2]; ; CHECK-DAG: ld.param.u16 {{%r[0-9]+}}, [test_i24_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}}; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i24, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], {{%r[0-9]+}}; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; +; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; define i24 @test_i24(i24 %a) { %r = tail call i24 @test_i24(i24 %a); @@ -610,12 +610,12 @@ define i24 @test_i24(i24 %a) { ; CHECK-NEXT: .param .b32 test_i29_param_0 ; CHECK: ld.param.u32 {{%r[0-9]+}}, [test_i29_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}}; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i29, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], {{%r[0-9]+}}; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; +; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; ; CHECK-NEXT: ret; define i29 @test_i29(i29 %a) { %r = tail call i29 @test_i29(i29 %a); @@ -627,12 +627,12 @@ define i29 @test_i29(i29 %a) { ; CHECK-NEXT: .param .b32 test_i32_param_0 ; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define i32 @test_i32(i32 %a) { %r = tail call i32 @test_i32(i32 %a); @@ -645,14 +645,14 @@ define i32 @test_i32(i32 %a) { ; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; ; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b32 [param0+8], [[E2]]; ; CHECK: .param .align 16 .b8 retval0[16]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v3i32, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; ; CHECK-NEXT: ret; define <3 x i32> @test_v3i32(<3 x i32> %a) { @@ -665,12 +665,12 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) { ; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16] ; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0] ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK: .param .align 16 .b8 retval0[16]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v4i32, -; CHECK: ld.param.v4.b32 
{[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0]; +; CHECK: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} ; CHECK-NEXT: ret; define <4 x i32> @test_v4i32(<4 x i32> %a) { %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a); @@ -683,14 +683,14 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) { ; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; ; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] ; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; ; CHECK: .param .align 32 .b8 retval0[32]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v5i32, -; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} ; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]]; ; CHECK-NEXT: ret; define <5 x i32> @test_v5i32(<5 x i32> %a) { @@ -703,12 +703,12 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) { ; CHECK-NEXT: .param .b32 test_f32_param_0 ; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.f32 [param0+0], [[E]]; +; CHECK: st.param.f32 [param0], [[E]]; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; +; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0]; +; CHECK: st.param.f32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define float @test_f32(float %a) { %r = tail call float @test_f32(float %a); @@ -721,12 +721,12 @@ define float @test_f32(float %a) { ; CHECK-DAG: ld.param.u8 {{%rd[0-9]+}}, [test_i40_param_0+4]; ; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i40_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i40, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], {{%rd[0-9]+}}; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; +; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; define i40 @test_i40(i40 %a) { %r = tail call i40 @test_i40(i40 %a); @@ -739,12 +739,12 @@ define i40 @test_i40(i40 %a) { ; CHECK-DAG: ld.param.u16 {{%rd[0-9]+}}, [test_i47_param_0+4]; ; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i47_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i47, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], {{%rd[0-9]+}}; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; +; CHECK: st.param.b64 
[func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; define i47 @test_i47(i47 %a) { %r = tail call i47 @test_i47(i47 %a); @@ -757,12 +757,12 @@ define i47 @test_i47(i47 %a) { ; CHECK-DAG: ld.param.u16 {{%rd[0-9]+}}, [test_i48_param_0+4]; ; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i48_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i48, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], {{%rd[0-9]+}}; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; +; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; define i48 @test_i48(i48 %a) { %r = tail call i48 @test_i48(i48 %a); @@ -776,12 +776,12 @@ define i48 @test_i48(i48 %a) { ; CHECK-DAG: ld.param.u16 {{%rd[0-9]+}}, [test_i51_param_0+4]; ; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i51_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i51, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], {{%rd[0-9]+}}; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; +; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; define i51 @test_i51(i51 %a) { %r = tail call i51 @test_i51(i51 %a); @@ -795,12 +795,12 @@ define i51 @test_i51(i51 %a) { ; CHECK-DAG: ld.param.u16 {{%rd[0-9]+}}, [test_i56_param_0+4]; ; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i56_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i56, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], {{%rd[0-9]+}}; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; +; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; define i56 @test_i56(i56 %a) { %r = tail call i56 @test_i56(i56 %a); @@ -812,12 +812,12 @@ define i56 @test_i56(i56 %a) { ; CHECK-NEXT: .param .b64 test_i57_param_0 ; CHECK: ld.param.u64 {{%rd[0-9]+}}, [test_i57_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i57, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], {{%rd[0-9]+}}; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; +; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; ; CHECK-NEXT: ret; define i57 @test_i57(i57 %a) { %r = tail call i57 @test_i57(i57 %a); @@ -829,12 +829,12 @@ define i57 @test_i57(i57 %a) { ; CHECK-NEXT: .param .b64 test_i64_param_0 ; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], [[E]]; +; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: .param .b64 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0]; +; CHECK: st.param.b64 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define i64 @test_i64(i64 %a) { %r = tail call i64 @test_i64(i64 %a); @@ -847,16 +847,16 @@ define i64 @test_i64(i64 %a) { ; CHECK-DAG: ld.param.u64 
[[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; ; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; ; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b64 [param0+16], [[E2]]; ; CHECK: .param .align 32 .b8 retval0[32]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v3i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0]; ; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[RE0]], [[RE1]]}; ; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[RE0]], [[RE1]]}; ; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; ; CHECK-NEXT: ret; define <3 x i64> @test_v3i64(<3 x i64> %a) { @@ -871,15 +871,15 @@ define <3 x i64> @test_v3i64(<3 x i64> %a) { ; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; ; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; ; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; ; CHECK: .param .align 32 .b8 retval0[32]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v4i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0]; ; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; ; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[RE0]], [[RE1]]}; ; CHECK-NEXT: ret; define <4 x i64> @test_v4i64(<4 x i64> %a) { %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a); @@ -893,12 +893,12 @@ define <4 x i64> @test_v4i64(<4 x i64> %a) { ; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] ; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; ; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0+0], [[A]] +; CHECK: st.param.b8 [param0], [[A]] ; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK: call.uni ; CHECK-NEXT: test_s_i1, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b8 [func_retval0+0], [[R]]; +; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0]; +; CHECK: st.param.b8 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define %s_i1 @test_s_i1(%s_i1 %a) { %r = tail call %s_i1 @test_s_i1(%s_i1 %a); @@ -910,12 +910,12 @@ define %s_i1 @test_s_i1(%s_i1 %a) { ; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] ; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; ; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0+0], [[A]] +; CHECK: st.param.b8 [param0], [[A]] ; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK: call.uni ; CHECK-NEXT: test_s_i8, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b8 [func_retval0+0], [[R]]; +; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0]; +; CHECK: st.param.b8 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define %s_i8 @test_s_i8(%s_i8 %a) { %r = tail call %s_i8 @test_s_i8(%s_i8 %a); @@ -927,12 +927,12 @@ define %s_i8 
@test_s_i8(%s_i8 %a) { ; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] ; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; ; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: st.param.b16 [param0], [[A]] ; CHECK: .param .align 2 .b8 retval0[2]; ; CHECK: call.uni ; CHECK-NEXT: test_s_i16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; +; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define %s_i16 @test_s_i16(%s_i16 %a) { %r = tail call %s_i16 @test_s_i16(%s_i16 %a); @@ -944,12 +944,12 @@ define %s_i16 @test_s_i16(%s_i16 %a) { ; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2] ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_f16_param_0]; ; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: st.param.b16 [param0], [[A]] ; CHECK: .param .align 2 .b8 retval0[2]; ; CHECK: call.uni ; CHECK-NEXT: test_s_f16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; +; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define %s_f16 @test_s_f16(%s_f16 %a) { %r = tail call %s_f16 @test_s_f16(%s_f16 %a); @@ -961,12 +961,12 @@ define %s_f16 @test_s_f16(%s_f16 %a) { ; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4] ; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; ; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define %s_i32 @test_s_i32(%s_i32 %a) { %r = tail call %s_i32 @test_s_i32(%s_i32 %a); @@ -978,12 +978,12 @@ define %s_i32 @test_s_i32(%s_i32 %a) { ; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] ; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0]; ; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.f32 [param0+0], [[E]]; +; CHECK: st.param.f32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; +; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0]; +; CHECK: st.param.f32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define %s_f32 @test_s_f32(%s_f32 %a) { %r = tail call %s_f32 @test_s_f32(%s_f32 %a); @@ -995,12 +995,12 @@ define %s_f32 @test_s_f32(%s_f32 %a) { ; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] ; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.b64 [param0+0], [[E]]; +; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: .param .align 8 .b8 retval0[8]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0]; +; CHECK: st.param.b64 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define %s_i64 @test_s_i64(%s_i64 %a) { %r = tail call %s_i64 @test_s_i64(%s_i64 %a); @@ -1017,7 +1017,7 @@ define %s_i64 @test_s_i64(%s_i64 %a) { ; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4]; ; 
CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; ; CHECK: .param .align 8 .b8 param0[24]; -; CHECK-DAG: st.param.b32 [param0+0], [[E0]]; +; CHECK-DAG: st.param.b32 [param0], [[E0]]; ; CHECK-DAG: st.param.f32 [param0+4], [[E1]]; ; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; ; CHECK-DAG: st.param.f32 [param0+12], [[E3]]; @@ -1025,12 +1025,12 @@ define %s_i64 @test_s_i64(%s_i64 %a) { ; CHECK: .param .align 8 .b8 retval0[24]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i32f32, -; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4]; ; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; ; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12]; ; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]]; +; CHECK-DAG: st.param.b32 [func_retval0], [[RE0]]; ; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]]; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; ; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]]; @@ -1049,16 +1049,16 @@ define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { ; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; ; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; ; CHECK: .param .align 8 .b8 param0[24]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; ; CHECK: st.param.b64 [param0+16], [[E4]]; ; CHECK: .param .align 8 .b8 retval0[24]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i32x4, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8]; ; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; ; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]}; ; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; ; CHECK: ret; @@ -1077,7 +1077,7 @@ define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { ; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; ; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; ; CHECK: .param .align 8 .b8 param0[32]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b8 [param0+8], [[E2]]; ; CHECK: st.param.b32 [param0+12], [[E3]]; ; CHECK: st.param.b32 [param0+16], [[E4]]; @@ -1088,12 +1088,12 @@ define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { ; CHECK: ( ; CHECK: param0 ; CHECK: ); -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; ; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; ; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; ; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; ; CHECK: st.param.b8 [func_retval0+8], [[RE2]]; ; CHECK: st.param.b32 [func_retval0+12], [[RE3]]; ; CHECK: st.param.b32 [func_retval0+16], [[RE4]]; @@ -1136,7 +1136,7 @@ define %s_i8i32x4 
@test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; ; CHECK: .param .align 1 .b8 param0[25]; -; CHECK-DAG: st.param.b8 [param0+0], +; CHECK-DAG: st.param.b8 [param0], ; CHECK-DAG: st.param.b8 [param0+1], ; CHECK-DAG: st.param.b8 [param0+2], ; CHECK-DAG: st.param.b8 [param0+3], @@ -1164,7 +1164,7 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK: .param .align 1 .b8 retval0[25]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i1i32x4p, -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+0]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0]; ; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1]; ; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2]; ; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3]; @@ -1190,7 +1190,7 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23]; ; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24]; ; CHECK: } // callseq -; CHECK-DAG: st.param.b8 [func_retval0+0], +; CHECK-DAG: st.param.b8 [func_retval0], ; CHECK-DAG: st.param.b8 [func_retval0+1], ; CHECK-DAG: st.param.b8 [func_retval0+2], ; CHECK-DAG: st.param.b8 [func_retval0+3], @@ -1232,7 +1232,7 @@ define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { ; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; ; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; ; CHECK: .param .align 16 .b8 param0[80]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b32 [param0+8], [[E2]]; ; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; ; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; @@ -1241,13 +1241,13 @@ define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { ; CHECK: .param .align 16 .b8 retval0[80]; ; CHECK: call.uni (retval0), ; CHECK: test_s_crossfield, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; ; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; ; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; ; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; ; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; ; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; ; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; ; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll index 5c09bb8e1a5d..8c506fb0f75a 100644 --- a/llvm/test/CodeGen/NVPTX/param-overalign.ll +++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll @@ -28,16 +28,16 @@ define float @caller_md(float %a, float %b) { ; CHECK-NEXT: ld.param.f32 %f2, [caller_md_param_1]; ; CHECK-NEXT: { ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.f32 [param0+0], {%f1, %f2}; +; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2}; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: 
callee_md, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.f32 %f3, [retval0+0]; +; CHECK-NEXT: ld.param.f32 %f3, [retval0]; ; CHECK-NEXT: } -; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3; +; CHECK-NEXT: st.param.f32 [func_retval0], %f3; ; CHECK-NEXT: ret; %s1 = insertvalue %struct.float2 poison, float %a, 0 %s2 = insertvalue %struct.float2 %s1, float %b, 1 @@ -53,7 +53,7 @@ define float @callee_md(%struct.float2 %a) { ; CHECK: ld.param.v2.f32 {%f1, %f2}, [callee_md_param_0]; ; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2; -; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3; +; CHECK-NEXT: st.param.f32 [func_retval0], %f3; ; CHECK-NEXT: ret; %v0 = extractvalue %struct.float2 %a, 0 %v1 = extractvalue %struct.float2 %a, 1 @@ -72,16 +72,16 @@ define float @caller(float %a, float %b) { ; CHECK-NEXT: ld.param.f32 %f2, [caller_param_1]; ; CHECK-NEXT: { ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.f32 [param0+0], {%f1, %f2}; +; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2}; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: callee, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.f32 %f3, [retval0+0]; +; CHECK-NEXT: ld.param.f32 %f3, [retval0]; ; CHECK-NEXT: } -; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3; +; CHECK-NEXT: st.param.f32 [func_retval0], %f3; ; CHECK-NEXT: ret; %s1 = insertvalue %struct.float2 poison, float %a, 0 %s2 = insertvalue %struct.float2 %s1, float %b, 1 @@ -97,7 +97,7 @@ define float @callee(%struct.float2 alignstack(8) %a ) { ; CHECK: ld.param.v2.f32 {%f1, %f2}, [callee_param_0]; ; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2; -; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3; +; CHECK-NEXT: st.param.f32 [func_retval0], %f3; ; CHECK-NEXT: ret; %v0 = extractvalue %struct.float2 %a, 0 %v1 = extractvalue %struct.float2 %a, 1 diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index 55fadf10f8d6..db8b1a6f53d1 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -84,14 +84,14 @@ define dso_local void @caller_St4x1(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x1_param_1 ; CHECK: ) ; CHECK: .param .b32 param0; - ; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}}; + ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[4]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: callee_St4x1, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0]; + ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; %1 = load i32, ptr %in, align 4 %call = tail call fastcc [1 x i32] @callee_St4x1(i32 %1) #2 %.fca.0.extract = extractvalue [1 x i32] %call, 0 @@ -104,7 +104,7 @@ define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) { ; CHECK-LABEL: callee_St4x1( ; CHECK-NEXT: .param .b32 callee_St4x1_param_0 ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0]; - ; CHECK: st.param.b32 [func_retval0+0], [[R1]]; + ; CHECK: st.param.b32 [func_retval0], [[R1]]; ; CHECK-NEXT: ret; %oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0 ret [1 x i32] %oldret @@ -116,14 +116,14 @@ define dso_local void @caller_St4x2(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x2_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[8]; - ; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v2.b32 [param0], 
{{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[8]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: callee_St4x2, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; %agg.tmp = alloca %struct.St4x2, align 8 %1 = load i64, ptr %in, align 4 store i64 %1, ptr %agg.tmp, align 8 @@ -141,7 +141,7 @@ define internal fastcc [2 x i32] @callee_St4x2(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St4x2( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x2_param_0[8] ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0]; - ; CHECK: st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]}; + ; CHECK: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 %arrayidx.1 = getelementptr inbounds [2 x i32], ptr %in, i64 0, i64 1 @@ -157,7 +157,7 @@ define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x3_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[12]; - ; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+8], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[12]; ; CHECK: call.uni (retval0), @@ -165,7 +165,7 @@ define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+8]; %call = tail call fastcc [3 x i32] @callee_St4x3(ptr noundef nonnull byval(%struct.St4x3) align 4 %in) #2 %.fca.0.extract = extractvalue [3 x i32] %call, 0 @@ -185,7 +185,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; - ; CHECK: st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]}; + ; CHECK: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 @@ -205,14 +205,14 @@ define dso_local void @caller_St4x4(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x4_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[16]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: callee_St4x4, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; %call = tail call fastcc [4 x i32] @callee_St4x4(ptr noundef nonnull byval(%struct.St4x4) align 4 %in) #2 %.fca.0.extract = extractvalue [4 x i32] %call, 0 %.fca.1.extract = extractvalue [4 x i32] %call, 1 @@ -233,7 +233,7 @@ define internal fastcc [4 x i32] @callee_St4x4(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St4x4( ; CHECK-NEXT: .param .align 16 .b8 
callee_St4x4_param_0[16] ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 %arrayidx.1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1 @@ -255,7 +255,7 @@ define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x5_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[20]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+16], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[20]; ; CHECK: call.uni (retval0), @@ -263,7 +263,7 @@ define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+16]; %call = tail call fastcc [5 x i32] @callee_St4x5(ptr noundef nonnull byval(%struct.St4x5) align 4 %in) #2 %.fca.0.extract = extractvalue [5 x i32] %call, 0 @@ -289,7 +289,7 @@ define internal fastcc [5 x i32] @callee_St4x5(ptr nocapture noundef readonly by ; CHECK-NEXT: .param .align 16 .b8 callee_St4x5_param_0[20] ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0]; ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [callee_St4x5_param_0+16]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK: st.param.b32 [func_retval0+16], [[R5]]; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 @@ -315,7 +315,7 @@ define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x6_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[24]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[24]; ; CHECK: call.uni (retval0), @@ -323,7 +323,7 @@ define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; %call = tail call fastcc [6 x i32] @callee_St4x6(ptr noundef nonnull byval(%struct.St4x6) align 4 %in) #2 %.fca.0.extract = extractvalue [6 x i32] %call, 0 @@ -352,7 +352,7 @@ define internal fastcc [6 x i32] @callee_St4x6(ptr nocapture noundef readonly by ; CHECK-NEXT: .param .align 16 .b8 callee_St4x6_param_0[24] ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0]; ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, 
[callee_St4x6_param_0+16]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 @@ -381,7 +381,7 @@ define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x7_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[28]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+24], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[28]; @@ -390,7 +390,7 @@ define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+24]; %call = tail call fastcc [7 x i32] @callee_St4x7(ptr noundef nonnull byval(%struct.St4x7) align 4 %in) #2 @@ -424,7 +424,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; ; CHECK: st.param.b32 [func_retval0+24], [[R7]]; ; CHECK-NEXT: ret; @@ -457,7 +457,7 @@ define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x8_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[32]; - ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[32]; ; CHECK: call.uni (retval0), @@ -465,7 +465,7 @@ define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; %call = tail call fastcc [8 x i32] @callee_St4x8(ptr noundef nonnull byval(%struct.St4x8) align 4 %in) #2 %.fca.0.extract = extractvalue [8 x i32] %call, 0 @@ -500,7 +500,7 @@ define internal fastcc [8 x i32] @callee_St4x8(ptr nocapture noundef readonly by ; CHECK-NEXT: .param .align 16 .b8 callee_St4x8_param_0[32] ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0]; ; CHECK: ld.param.v4.u32 
{[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK: st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]}; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 @@ -535,14 +535,14 @@ define dso_local void @caller_St8x1(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St8x1_param_1 ; CHECK: ) ; CHECK: .param .b64 param0; - ; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; + ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[8]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: callee_St8x1, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; + ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; %1 = load i64, ptr %in, align 8 %call = tail call fastcc [1 x i64] @callee_St8x1(i64 %1) #2 %.fca.0.extract = extractvalue [1 x i64] %call, 0 @@ -555,7 +555,7 @@ define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) { ; CHECK-LABEL: callee_St8x1( ; CHECK-NEXT: .param .b64 callee_St8x1_param_0 ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0]; - ; CHECK: st.param.b64 [func_retval0+0], [[RD1]]; + ; CHECK: st.param.b64 [func_retval0], [[RD1]]; ; CHECK-NEXT: ret; %oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0 ret [1 x i64] %oldret @@ -567,14 +567,14 @@ define dso_local void @caller_St8x2(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St8x2_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[16]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: callee_St8x2, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0]; %call = tail call fastcc [2 x i64] @callee_St8x2(ptr noundef nonnull byval(%struct.St8x2) align 8 %in) #2 %.fca.0.extract = extractvalue [2 x i64] %call, 0 %.fca.1.extract = extractvalue [2 x i64] %call, 1 @@ -589,7 +589,7 @@ define internal fastcc [2 x i64] @callee_St8x2(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St8x2( ; CHECK-NEXT: .param .align 16 .b8 callee_St8x2_param_0[16] ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0]; - ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]}; ; CHECK-NEXT: ret; %1 = load i64, ptr %in, align 8 %arrayidx.1 = getelementptr inbounds [2 x i64], ptr %in, i64 0, i64 1 @@ -605,7 +605,7 @@ define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St8x3_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[24]; - ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: st.param.b64 [param0+16], {{%rd[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[24]; ; CHECK: call.uni (retval0), @@ -613,7 +613,7 @@ define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; + ; CHECK: 
ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0]; ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+16]; %call = tail call fastcc [3 x i64] @callee_St8x3(ptr noundef nonnull byval(%struct.St8x3) align 8 %in) #2 %.fca.0.extract = extractvalue [3 x i64] %call, 0 @@ -633,7 +633,7 @@ define internal fastcc [3 x i64] @callee_St8x3(ptr nocapture noundef readonly by ; CHECK-NEXT: .param .align 16 .b8 callee_St8x3_param_0[24] ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0]; ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [callee_St8x3_param_0+16]; - ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]}; ; CHECK: st.param.b64 [func_retval0+16], [[RD3]]; ; CHECK-NEXT: ret; %1 = load i64, ptr %in, align 8 @@ -653,7 +653,7 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St8x4_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[32]; - ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[32]; ; CHECK: call.uni (retval0), @@ -661,7 +661,7 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); - ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0]; ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16]; %call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2 %.fca.0.extract = extractvalue [4 x i64] %call, 0 @@ -684,7 +684,7 @@ define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly by ; CHECK-NEXT: .param .align 16 .b8 callee_St8x4_param_0[32] ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0]; ; CHECK: ld.param.v2.u64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16]; - ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]}; ; CHECK: st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]}; ; CHECK-NEXT: ret; %1 = load i64, ptr %in, align 8 @@ -708,7 +708,7 @@ define private fastcc [4 x i32] @callee_St4x4_private(ptr nocapture noundef read ; CHECK-LABEL: callee_St4x4_private( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_private_param_0[16] ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0]; - ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 %arrayidx.1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1 @@ -735,7 +735,7 @@ define external fastcc [4 x i32] @callee_St4x4_external(ptr nocapture noundef re ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [callee_St4x4_external_param_0+4]; ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x4_external_param_0+8]; ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [callee_St4x4_external_param_0+12]; - ; CHECK: st.param.b32 [func_retval0+0], [[R1]]; + ; CHECK: st.param.b32 [func_retval0], [[R1]]; ; CHECK: st.param.b32 [func_retval0+4], [[R2]]; ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; ; CHECK: st.param.b32 
[func_retval0+12], [[R4]];
diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
index fa138f3d0936..4c9a2ee80c25 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
@@ -12,14 +12,14 @@ declare i1 @callee_i1()
 define i1 @check_i1() {
 ; PTX-LABEL: check_i1
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0+0];
+; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%r[0-9]+]], [[LD]];
 ; PTX-WITHOUT-DAG: and.b32 [[RES:%r[0-9]+]], [[PROXY]], 1;
 ; PTX-WITH-DAG: and.b32 [[RES:%r[0-9]+]], [[LD]], 1;
-; PTX-DAG: st.param.b32 [func_retval0+0], [[RES]];
+; PTX-DAG: st.param.b32 [func_retval0], [[RES]];
 %ret = call i1 @callee_i1()
 ret i1 %ret
@@ -29,14 +29,14 @@ declare i16 @callee_i16()
 define i16 @check_i16() {
 ; PTX-LABEL: check_i16
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0+0];
+; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%r[0-9]+]], [[LD]];
 ; PTX-WITHOUT-DAG: and.b32 [[RES:%r[0-9]+]], [[PROXY]], 65535;
 ; PTX-WITH-DAG: and.b32 [[RES:%r[0-9]+]], [[LD]], 65535;
-; PTX-DAG: st.param.b32 [func_retval0+0], [[RES]];
+; PTX-DAG: st.param.b32 [func_retval0], [[RES]];
 %ret = call i16 @callee_i16()
 ret i16 %ret
@@ -46,12 +46,12 @@ declare i32 @callee_i32()
 define i32 @check_i32() {
 ; PTX-LABEL: check_i32
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0+0];
+; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%r[0-9]+]], [[LD]];
-; PTX-WITHOUT-DAG: st.param.b32 [func_retval0+0], [[PROXY]];
-; PTX-WITH-DAG: st.param.b32 [func_retval0+0], [[LD]];
+; PTX-WITHOUT-DAG: st.param.b32 [func_retval0], [[PROXY]];
+; PTX-WITH-DAG: st.param.b32 [func_retval0], [[LD]];
 %ret = call i32 @callee_i32()
 ret i32 %ret
@@ -61,12 +61,12 @@ declare i64 @callee_i64()
 define i64 @check_i64() {
 ; PTX-LABEL: check_i64
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.b64 [[LD:%rd[0-9]+]], [retval0+0];
+; PTX-DAG: ld.param.b64 [[LD:%rd[0-9]+]], [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.b64 [[PROXY:%rd[0-9]+]], [[LD]];
-; PTX-WITHOUT-DAG: st.param.b64 [func_retval0+0], [[PROXY]];
-; PTX-WITH-DAG: st.param.b64 [func_retval0+0], [[LD]];
+; PTX-WITHOUT-DAG: st.param.b64 [func_retval0], [[PROXY]];
+; PTX-WITH-DAG: st.param.b64 [func_retval0], [[LD]];
 %ret = call i64 @callee_i64()
 ret i64 %ret
@@ -76,13 +76,13 @@ declare i128 @callee_i128()
 define i128 @check_i128() {
 ; PTX-LABEL: check_i128
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.v2.b64 {[[LD0:%rd[0-9]+]], [[LD1:%rd[0-9]+]]}, [retval0+0];
+; PTX-DAG: ld.param.v2.b64 {[[LD0:%rd[0-9]+]], [[LD1:%rd[0-9]+]]}, [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.b64 [[PROXY0:%rd[0-9]+]], [[LD0]];
 ; PTX-WITHOUT-DAG: mov.b64 [[PROXY1:%rd[0-9]+]], [[LD1]];
-; PTX-WITHOUT-DAG: st.param.v2.b64 [func_retval0+0], {[[PROXY0]], [[PROXY1]]};
-; PTX-WITH-DAG: st.param.v2.b64 [func_retval0+0], {[[LD0]], [[LD1]]};
+; PTX-WITHOUT-DAG: st.param.v2.b64 [func_retval0], {[[PROXY0]], [[PROXY1]]};
+; PTX-WITH-DAG: st.param.v2.b64 [func_retval0], {[[LD0]], [[LD1]]};
 %ret = call i128 @callee_i128()
 ret i128 %ret
@@ -92,12 +92,12 @@ declare half @callee_f16()
 define half @check_f16() {
 ; PTX-LABEL: check_f16
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.b16 [[LD:%rs[0-9]+]], [retval0+0];
+; PTX-DAG: ld.param.b16 [[LD:%rs[0-9]+]], [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.b16 [[PROXY:%rs[0-9]+]], [[LD]];
-; PTX-WITHOUT-DAG: st.param.b16 [func_retval0+0], [[PROXY]];
-; PTX-WITH-DAG: st.param.b16 [func_retval0+0], [[LD]];
+; PTX-WITHOUT-DAG: st.param.b16 [func_retval0], [[PROXY]];
+; PTX-WITH-DAG: st.param.b16 [func_retval0], [[LD]];
 %ret = call half @callee_f16()
 ret half %ret
@@ -107,12 +107,12 @@ declare float @callee_f32()
 define float @check_f32() {
 ; PTX-LABEL: check_f32
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.f32 [[LD:%f[0-9]+]], [retval0+0];
+; PTX-DAG: ld.param.f32 [[LD:%f[0-9]+]], [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.f32 [[PROXY:%f[0-9]+]], [[LD]];
-; PTX-WITHOUT-DAG: st.param.f32 [func_retval0+0], [[PROXY]];
-; PTX-WITH-DAG: st.param.f32 [func_retval0+0], [[LD]];
+; PTX-WITHOUT-DAG: st.param.f32 [func_retval0], [[PROXY]];
+; PTX-WITH-DAG: st.param.f32 [func_retval0], [[LD]];
 %ret = call float @callee_f32()
 ret float %ret
@@ -122,12 +122,12 @@ declare double @callee_f64()
 define double @check_f64() {
 ; PTX-LABEL: check_f64
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.f64 [[LD:%fd[0-9]+]], [retval0+0];
+; PTX-DAG: ld.param.f64 [[LD:%fd[0-9]+]], [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.f64 [[PROXY:%fd[0-9]+]], [[LD]];
-; PTX-WITHOUT-DAG: st.param.f64 [func_retval0+0], [[PROXY]];
-; PTX-WITH-DAG: st.param.f64 [func_retval0+0], [[LD]];
+; PTX-WITHOUT-DAG: st.param.f64 [func_retval0], [[PROXY]];
+; PTX-WITH-DAG: st.param.f64 [func_retval0], [[LD]];
 %ret = call double @callee_f64()
 ret double %ret
@@ -137,15 +137,15 @@ declare <4 x i32> @callee_vec_i32()
 define <4 x i32> @check_vec_i32() {
 ; PTX-LABEL: check_vec_i32
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.v4.b32 {[[LD0:%r[0-9]+]], [[LD1:%r[0-9]+]], [[LD2:%r[0-9]+]], [[LD3:%r[0-9]+]]}, [retval0+0];
+; PTX-DAG: ld.param.v4.b32 {[[LD0:%r[0-9]+]], [[LD1:%r[0-9]+]], [[LD2:%r[0-9]+]], [[LD3:%r[0-9]+]]}, [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.b32 [[PROXY0:%r[0-9]+]], [[LD0]];
 ; PTX-WITHOUT-DAG: mov.b32 [[PROXY1:%r[0-9]+]], [[LD1]];
 ; PTX-WITHOUT-DAG: mov.b32 [[PROXY2:%r[0-9]+]], [[LD2]];
 ; PTX-WITHOUT-DAG: mov.b32 [[PROXY3:%r[0-9]+]], [[LD3]];
-; PTX-WITHOUT-DAG: st.param.v4.b32 [func_retval0+0], {[[PROXY0]], [[PROXY1]], [[PROXY2]], [[PROXY3]]};
-; PTX-WITH-DAG: st.param.v4.b32 [func_retval0+0], {[[LD0]], [[LD1]], [[LD2]], [[LD3]]};
+; PTX-WITHOUT-DAG: st.param.v4.b32 [func_retval0], {[[PROXY0]], [[PROXY1]], [[PROXY2]], [[PROXY3]]};
+; PTX-WITH-DAG: st.param.v4.b32 [func_retval0], {[[LD0]], [[LD1]], [[LD2]], [[LD3]]};
 %ret = call <4 x i32> @callee_vec_i32()
 ret <4 x i32> %ret
@@ -155,12 +155,12 @@ declare <2 x half> @callee_vec_f16()
 define <2 x half> @check_vec_f16() {
 ; PTX-LABEL: check_vec_f16
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0+0];
+; PTX-DAG: ld.param.b32 [[LD:%r[0-9]+]], [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%r[0-9]+]], [[LD]];
-; PTX-WITHOUT-DAG: st.param.b32 [func_retval0+0], [[PROXY]];
-; PTX-WITH-DAG: st.param.b32 [func_retval0+0], [[LD]];
+; PTX-WITHOUT-DAG: st.param.b32 [func_retval0], [[PROXY]];
+; PTX-WITH-DAG: st.param.b32 [func_retval0], [[LD]];
 %ret = call <2 x half> @callee_vec_f16()
 ret <2 x half> %ret
@@ -170,13 +170,13 @@ declare <2 x double> @callee_vec_f64()
 define <2 x double> @check_vec_f64() {
 ; PTX-LABEL: check_vec_f64
 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-; PTX-DAG: ld.param.v2.f64 {[[LD0:%fd[0-9]+]], [[LD1:%fd[0-9]+]]}, [retval0+0];
+; PTX-DAG: ld.param.v2.f64 {[[LD0:%fd[0-9]+]], [[LD1:%fd[0-9]+]]}, [retval0];
 ; PTX-DAG: } // callseq {{[0-9]+}}
 ; PTX-WITHOUT-DAG: mov.f64 [[PROXY0:%fd[0-9]+]], [[LD0]];
 ; PTX-WITHOUT-DAG: mov.f64 [[PROXY1:%fd[0-9]+]], [[LD1]];
-; PTX-WITHOUT-DAG: st.param.v2.f64 [func_retval0+0], {[[PROXY0]], [[PROXY1]]};
-; PTX-WITH-DAG: st.param.v2.f64 [func_retval0+0], {[[LD0]], [[LD1]]};
+; PTX-WITHOUT-DAG: st.param.v2.f64 [func_retval0], {[[PROXY0]], [[PROXY1]]};
+; PTX-WITH-DAG: st.param.v2.f64 [func_retval0], {[[LD0]], [[LD1]]};
 %ret = call <2 x double> @callee_vec_f64()
 ret <2 x double> %ret
diff --git a/llvm/test/CodeGen/NVPTX/rcp-opt.ll b/llvm/test/CodeGen/NVPTX/rcp-opt.ll
index e2443c27e849..ccc3db540097 100644
--- a/llvm/test/CodeGen/NVPTX/rcp-opt.ll
+++ b/llvm/test/CodeGen/NVPTX/rcp-opt.ll
@@ -15,7 +15,7 @@ define double @test1(double %in) {
 ; CHECK-NEXT: ld.param.f64 %fd1, [test1_param_0];
 ; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1;
 ; CHECK-NEXT: neg.f64 %fd3, %fd2;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd3;
 ; CHECK-NEXT: ret;
 %div = fdiv double 1.000000e+00, %in
 %neg = fsub double -0.000000e+00, %div
@@ -33,7 +33,7 @@ define double @test2(double %in) {
 ; CHECK-NEXT: ld.param.f64 %fd1, [test2_param_0];
 ; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1;
 ; CHECK-NEXT: neg.f64 %fd3, %fd2;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd3;
 ; CHECK-NEXT: ret;
 %div = fdiv double -1.000000e+00, %in
 ret double %div
@@ -50,7 +50,7 @@ define double @test3(double %in) {
 ; CHECK-NEXT: ld.param.f64 %fd1, [test3_param_0];
 ; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1;
 ; CHECK-NEXT: neg.f64 %fd3, %fd2;
-; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd3;
 ; CHECK-NEXT: ret;
 %neg = fsub double -0.000000e+00, %in
 %div = fdiv double 1.000000e+00, %neg
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 6586393f83d4..4174fd2f3ec2 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -31,7 +31,7 @@ define i32 @rotate32(i32 %a, i32 %b) {
 ; SM20-NEXT: and.b32 %r6, %r5, 31;
 ; SM20-NEXT: shr.u32 %r7, %r1, %r6;
 ; SM20-NEXT: or.b32 %r8, %r4, %r7;
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r8;
+; SM20-NEXT: st.param.b32 [func_retval0], %r8;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: rotate32(
@@ -42,7 +42,7 @@ define i32 @rotate32(i32 %a, i32 %b) {
 ; SM35-NEXT: // %bb.0:
 ; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0];
 ; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1];
 ; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
-; SM35-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM35-NEXT: st.param.b32 [func_retval0], %r3;
 ; SM35-NEXT: ret;
 %val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
 ret i32 %val
@@ -65,7 +65,7 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; SM20-NEXT: and.b32 %r4, %r3, 63;
 ; SM20-NEXT: shr.u64 %rd3, %rd1, %r4;
 ; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: rotate64(
@@ -82,7 +82,7 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; SM35-NEXT: and.b32 %r4, %r3, 63;
 ; SM35-NEXT: shr.u64 %rd3, %rd1, %r4;
 ; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM35-NEXT: ret;
 %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
 ret i64 %val
@@ -105,7 +105,7 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; SM20-NEXT: and.b32 %r4, %r3, 63;
 ; SM20-NEXT: shl.b64 %rd3, %rd1, %r4;
 ; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: rotateright64(
@@ -122,7 +122,7 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; SM35-NEXT: and.b32 %r4, %r3, 63;
 ; SM35-NEXT: shl.b64 %rd3, %rd1, %r4;
 ; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM35-NEXT: ret;
 %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
 ret i64 %val
@@ -140,7 +140,7 @@ define i32 @rotl0(i32 %x) {
 ; SM20-NEXT: shr.u32 %r2, %r1, 24;
 ; SM20-NEXT: shl.b32 %r3, %r1, 8;
 ; SM20-NEXT: or.b32 %r4, %r3, %r2;
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r4;
+; SM20-NEXT: st.param.b32 [func_retval0], %r4;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: rotl0(
@@ -150,7 +150,7 @@ define i32 @rotl0(i32 %x) {
 ; SM35-NEXT: // %bb.0:
 ; SM35-NEXT: ld.param.u32 %r1, [rotl0_param_0];
 ; SM35-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 8;
-; SM35-NEXT: st.param.b32 [func_retval0+0], %r2;
+; SM35-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM35-NEXT: ret;
 %t0 = shl i32 %x, 8
 %t1 = lshr i32 %x, 24
@@ -174,7 +174,7 @@ define i64 @rotl64(i64 %a, i64 %n) {
 ; SM20-NEXT: and.b32 %r4, %r3, 63;
 ; SM20-NEXT: shr.u64 %rd3, %rd1, %r4;
 ; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: rotl64(
@@ -191,7 +191,7 @@ define i64 @rotl64(i64 %a, i64 %n) {
 ; SM35-NEXT: and.b32 %r4, %r3, 63;
 ; SM35-NEXT: shr.u64 %rd3, %rd1, %r4;
 ; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM35-NEXT: ret;
 %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
 ret i64 %val
@@ -208,7 +208,7 @@ define i64 @rotl64_imm(i64 %a) {
 ; SM20-NEXT: shr.u64 %rd2, %rd1, 62;
 ; SM20-NEXT: shl.b64 %rd3, %rd1, 2;
 ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: rotl64_imm(
@@ -220,7 +220,7 @@ define i64 @rotl64_imm(i64 %a) {
 ; SM35-NEXT: shr.u64 %rd2, %rd1, 62;
 ; SM35-NEXT: shl.b64 %rd3, %rd1, 2;
 ; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM35-NEXT: ret;
 %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
 ret i64 %val
@@ -242,7 +242,7 @@ define i64 @rotr64(i64 %a, i64 %n) {
 ; SM20-NEXT: and.b32 %r4, %r3, 63;
 ; SM20-NEXT: shl.b64 %rd3, %rd1, %r4;
 ; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: rotr64(
@@ -259,7 +259,7 @@ define i64 @rotr64(i64 %a, i64 %n) {
 ; SM35-NEXT: and.b32 %r4, %r3, 63;
 ; SM35-NEXT: shl.b64 %rd3, %rd1, %r4;
 ; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM35-NEXT: ret;
 %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
 ret i64 %val
@@ -276,7 +276,7 @@ define i64 @rotr64_imm(i64 %a) {
 ; SM20-NEXT: shl.b64 %rd2, %rd1, 62;
 ; SM20-NEXT: shr.u64 %rd3, %rd1, 2;
 ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: rotr64_imm(
@@ -288,7 +288,7 @@ define i64 @rotr64_imm(i64 %a) {
 ; SM35-NEXT: shl.b64 %rd2, %rd1, 62;
 ; SM35-NEXT: shr.u64 %rd3, %rd1, 2;
 ; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
 ; SM35-NEXT: ret;
 %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
 ret i64 %val
@@ -310,7 +310,7 @@ define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) {
 ; SM20-NEXT: and.b32 %r8, %r7, 31;
 ; SM20-NEXT: shl.b32 %r9, %r6, %r8;
 ; SM20-NEXT: or.b32 %r10, %r9, %r5;
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r10;
+; SM20-NEXT: st.param.b32 [func_retval0], %r10;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: funnel_shift_right_32(
@@ -322,7 +322,7 @@ define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) {
 ; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_1];
 ; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_right_32_param_2];
 ; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
-; SM35-NEXT: st.param.b32 [func_retval0+0], %r4;
+; SM35-NEXT: st.param.b32 [func_retval0], %r4;
 ; SM35-NEXT: ret;
 %val = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
 ret i32 %val
@@ -344,7 +344,7 @@ define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) {
 ; SM20-NEXT: and.b32 %r8, %r7, 31;
 ; SM20-NEXT: shr.u32 %r9, %r6, %r8;
 ; SM20-NEXT: or.b32 %r10, %r4, %r9;
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r10;
+; SM20-NEXT: st.param.b32 [func_retval0], %r10;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: funnel_shift_left_32(
@@ -356,7 +356,7 @@ define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) {
 ; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_1];
 ; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_left_32_param_2];
 ; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
-; SM35-NEXT: st.param.b32 [func_retval0+0], %r4;
+; SM35-NEXT: st.param.b32 [func_retval0], %r4;
 ; SM35-NEXT: ret;
 %val = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
 ret i32 %val
@@ -379,7 +379,7 @@ define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) {
 ; SM20-NEXT: and.b32 %r4, %r3, 63;
 ; SM20-NEXT: shl.b64 %rd5, %rd4, %r4;
 ; SM20-NEXT: or.b64 %rd6, %rd5, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd6;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: funnel_shift_right_64(
@@ -398,7 +398,7 @@ define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) {
 ; SM35-NEXT: and.b32 %r4, %r3, 63;
 ; SM35-NEXT: shl.b64 %rd5, %rd4, %r4;
 ; SM35-NEXT: or.b64 %rd6, %rd5, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6;
+; SM35-NEXT: st.param.b64 [func_retval0], %rd6;
 ; SM35-NEXT: ret;
 %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
 ret i64 %val
@@ -421,7 +421,7 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) {
 ; SM20-NEXT: and.b32 %r4, %r3, 63;
 ; SM20-NEXT: shr.u64 %rd5, %rd4, %r4;
 ; SM20-NEXT: or.b64 %rd6, %rd2, %rd5;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd6;
 ; SM20-NEXT: ret;
 ;
 ; SM35-LABEL: funnel_shift_left_64(
@@ -440,7 +440,7 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) {
 ; SM35-NEXT: and.b32 %r4, %r3, 63;
 ; SM35-NEXT: shr.u64 %rd5, %rd4, %r4;
 ; SM35-NEXT: or.b64 %rd6, %rd2, %rd5;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6;
+; SM35-NEXT: st.param.b64 [func_retval0], %rd6;
 ; SM35-NEXT: ret;
 %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c)
 ret i64 %val
diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll
index 05fdb02ac747..d4851f55d93c 100644
--- a/llvm/test/CodeGen/NVPTX/rotate_64.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll
@@ -15,7 +15,7 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; CHECK-NEXT: shr.u64 %rd2, %rd1, 61;
 ; CHECK-NEXT: shl.b64 %rd3, %rd1, 3;
 ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT: ret;
 %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 3)
 ret i64 %val
@@ -31,7 +31,7 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; CHECK-NEXT: shl.b64 %rd2, %rd1, 61;
 ; CHECK-NEXT: shr.u64 %rd3, %rd1, 3;
 ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT: ret;
 %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 3)
 ret i64 %val
diff --git a/llvm/test/CodeGen/NVPTX/sad-intrins.ll b/llvm/test/CodeGen/NVPTX/sad-intrins.ll
index a09413bc4e52..8258dca605e9 100644
--- a/llvm/test/CodeGen/NVPTX/sad-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/sad-intrins.ll
@@ -14,7 +14,7 @@ define i16 @test_sad_i16(i16 %x, i16 %y, i16 %z) {
 ; CHECK-NEXT: ld.param.u16 %rs3, [test_sad_i16_param_2];
 ; CHECK-NEXT: sad.s16 %rs4, %rs1, %rs2, %rs3;
 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT: ret;
 %1 = call i16 @llvm.nvvm.sad.s(i16 %x, i16 %y, i16 %z)
 ret i16 %1
@@ -32,7 +32,7 @@ define i16 @test_sad_u16(i16 %x, i16 %y, i16 %z) {
 ; CHECK-NEXT: ld.param.u16 %rs3, [test_sad_u16_param_2];
 ; CHECK-NEXT: sad.u16 %rs4, %rs1, %rs2, %rs3;
 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT: ret;
 %1 = call i16 @llvm.nvvm.sad.us(i16 %x, i16 %y, i16 %z)
 ret i16 %1
@@ -48,7 +48,7 @@ define i32 @test_sad_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-NEXT: ld.param.u32 %r2, [test_sad_i32_param_1];
 ; CHECK-NEXT: ld.param.u32 %r3, [test_sad_i32_param_2];
 ; CHECK-NEXT: sad.s32 %r4, %r1, %r2, %r3;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT: ret;
 %1 = call i32 @llvm.nvvm.sad.i(i32 %x, i32 %y, i32 %z)
 ret i32 %1
@@ -64,7 +64,7 @@ define i32 @test_sad_u32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-NEXT: ld.param.u32 %r2, [test_sad_u32_param_1];
 ; CHECK-NEXT: ld.param.u32 %r3, [test_sad_u32_param_2];
 ; CHECK-NEXT: sad.u32 %r4, %r1, %r2, %r3;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT: ret;
 %1 = call i32 @llvm.nvvm.sad.ui(i32 %x, i32 %y, i32 %z)
 ret i32 %1
@@ -80,7 +80,7 @@ define i64 @test_sad_i64(i64 %x, i64 %y, i64 %z) {
 ; CHECK-NEXT: ld.param.u64 %rd2, [test_sad_i64_param_1];
 ; CHECK-NEXT: ld.param.u64 %rd3, [test_sad_i64_param_2];
 ; CHECK-NEXT: sad.s64 %rd4, %rd1, %rd2, %rd3;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT: ret;
 %1 = call i64 @llvm.nvvm.sad.ll(i64 %x, i64 %y, i64 %z)
 ret i64 %1
@@ -96,7 +96,7 @@ define i64 @test_sad_u64(i64 %x, i64 %y, i64 %z) {
 ; CHECK-NEXT: ld.param.u64 %rd2, [test_sad_u64_param_1];
 ; CHECK-NEXT: ld.param.u64 %rd3, [test_sad_u64_param_2];
 ; CHECK-NEXT: sad.u64 %rd4, %rd1, %rd2, %rd3;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT: ret;
 %1 = call i64 @llvm.nvvm.sad.ull(i64 %x, i64 %y, i64 %z)
 ret i64 %1
diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll
index f471d47077cf..0cb0c1ba8c6b 100644
--- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll
@@ -19,7 +19,7 @@ define <2 x i16> @sext_setcc_v2i1_to_v2i16(ptr %p) {
 ; CHECK-NEXT: selp.s16 %rs3, -1, 0, %p2;
 ; CHECK-NEXT: selp.s16 %rs4, -1, 0, %p1;
 ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT: ret;
 entry:
 %v = load <2 x i16>, ptr %p, align 4
@@ -62,7 +62,7 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) {
 ; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8;
 ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1;
 ; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r12;
 ; CHECK-NEXT: ret;
 entry:
 %v = load <4 x i8>, ptr %p, align 4
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
index 29f27c1ba6cd..b178f5e05296 100644
--- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -26,7 +26,7 @@ define void @st_param_i8_i16() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 0, 0
 ; CHECK-NEXT: .param .align 2 .b8 param0[4];
-; CHECK-NEXT: st.param.b8 [param0+0], 1;
+; CHECK-NEXT: st.param.b8 [param0], 1;
 ; CHECK-NEXT: st.param.b16 [param0+2], 2;
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_i8_i16,
@@ -47,7 +47,7 @@ define void @st_param_i32() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 1, 0
 ; CHECK-NEXT: .param .b32 param0;
-; CHECK-NEXT: st.param.b32 [param0+0], 3;
+; CHECK-NEXT: st.param.b32 [param0], 3;
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_i32,
 ; CHECK-NEXT: (
@@ -67,7 +67,7 @@ define void @st_param_i64() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 2, 0
 ; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0+0], 4;
+; CHECK-NEXT: st.param.b64 [param0], 4;
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_i64,
 ; CHECK-NEXT: (
@@ -87,7 +87,7 @@ define void @st_param_f32() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 3, 0
 ; CHECK-NEXT: .param .b32 param0;
-; CHECK-NEXT: st.param.f32 [param0+0], 0f40A00000;
+; CHECK-NEXT: st.param.f32 [param0], 0f40A00000;
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_f32,
 ; CHECK-NEXT: (
@@ -107,7 +107,7 @@ define void @st_param_f64() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 4, 0
 ; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.f64 [param0+0], 0d4018000000000000;
+; CHECK-NEXT: st.param.f64 [param0], 0d4018000000000000;
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_f64,
 ; CHECK-NEXT: (
@@ -133,7 +133,7 @@ define void @st_param_v2_i8_ii() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 5, 0
 ; CHECK-NEXT: .param .align 2 .b8 param0[2];
-; CHECK-NEXT: st.param.v2.b8 [param0+0], {1, 2};
+; CHECK-NEXT: st.param.v2.b8 [param0], {1, 2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i8,
 ; CHECK-NEXT: (
@@ -153,7 +153,7 @@ define void @st_param_v2_i8_ir(i8 %val) {
 ; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v2_i8_ir_param_0];
 ; CHECK-NEXT: { // callseq 6, 0
 ; CHECK-NEXT: .param .align 2 .b8 param0[2];
-; CHECK-NEXT: st.param.v2.b8 [param0+0], {1, %rs1};
+; CHECK-NEXT: st.param.v2.b8 [param0], {1, %rs1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i8,
 ; CHECK-NEXT: (
@@ -175,7 +175,7 @@ define void @st_param_v2_i8_ri(i8 %val) {
 ; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v2_i8_ri_param_0];
 ; CHECK-NEXT: { // callseq 7, 0
 ; CHECK-NEXT: .param .align 2 .b8 param0[2];
-; CHECK-NEXT: st.param.v2.b8 [param0+0], {%rs1, 2};
+; CHECK-NEXT: st.param.v2.b8 [param0], {%rs1, 2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i8,
 ; CHECK-NEXT: (
@@ -197,7 +197,7 @@ define void @st_param_v2_i16_ii() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 8, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v2.b16 [param0+0], {1, 2};
+; CHECK-NEXT: st.param.v2.b16 [param0], {1, 2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i16,
 ; CHECK-NEXT: (
@@ -217,7 +217,7 @@ define void @st_param_v2_i16_ir(i16 %val) {
 ; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v2_i16_ir_param_0];
 ; CHECK-NEXT: { // callseq 9, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v2.b16 [param0+0], {1, %rs1};
+; CHECK-NEXT: st.param.v2.b16 [param0], {1, %rs1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i16,
 ; CHECK-NEXT: (
@@ -239,7 +239,7 @@ define void @st_param_v2_i16_ri(i16 %val) {
 ; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v2_i16_ri_param_0];
 ; CHECK-NEXT: { // callseq 10, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v2.b16 [param0+0], {%rs1, 2};
+; CHECK-NEXT: st.param.v2.b16 [param0], {%rs1, 2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i16,
 ; CHECK-NEXT: (
@@ -261,7 +261,7 @@ define void @st_param_v2_i32_ii() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 11, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.b32 [param0+0], {1, 2};
+; CHECK-NEXT: st.param.v2.b32 [param0], {1, 2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i32,
 ; CHECK-NEXT: (
@@ -281,7 +281,7 @@ define void @st_param_v2_i32_ir(i32 %val) {
 ; CHECK-NEXT: ld.param.u32 %r1, [st_param_v2_i32_ir_param_0];
 ; CHECK-NEXT: { // callseq 12, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.b32 [param0+0], {1, %r1};
+; CHECK-NEXT: st.param.v2.b32 [param0], {1, %r1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i32,
 ; CHECK-NEXT: (
@@ -303,7 +303,7 @@ define void @st_param_v2_i32_ri(i32 %val) {
 ; CHECK-NEXT: ld.param.u32 %r1, [st_param_v2_i32_ri_param_0];
 ; CHECK-NEXT: { // callseq 13, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.b32 [param0+0], {%r1, 2};
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i32,
 ; CHECK-NEXT: (
@@ -325,7 +325,7 @@ define void @st_param_v2_i64_ii() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 14, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v2.b64 [param0+0], {1, 2};
+; CHECK-NEXT: st.param.v2.b64 [param0], {1, 2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i64,
 ; CHECK-NEXT: (
@@ -345,7 +345,7 @@ define void @st_param_v2_i64_ir(i64 %val) {
 ; CHECK-NEXT: ld.param.u64 %rd1, [st_param_v2_i64_ir_param_0];
 ; CHECK-NEXT: { // callseq 15, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v2.b64 [param0+0], {1, %rd1};
+; CHECK-NEXT: st.param.v2.b64 [param0], {1, %rd1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i64,
 ; CHECK-NEXT: (
@@ -367,7 +367,7 @@ define void @st_param_v2_i64_ri(i64 %val) {
 ; CHECK-NEXT: ld.param.u64 %rd1, [st_param_v2_i64_ri_param_0];
 ; CHECK-NEXT: { // callseq 16, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v2.b64 [param0+0], {%rd1, 2};
+; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, 2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_i64,
 ; CHECK-NEXT: (
@@ -389,7 +389,7 @@ define void @st_param_v2_f32_ii(float %val) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 17, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.f32 [param0+0], {0f3F800000, 0f40000000};
+; CHECK-NEXT: st.param.v2.f32 [param0], {0f3F800000, 0f40000000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_f32,
 ; CHECK-NEXT: (
@@ -409,7 +409,7 @@ define void @st_param_v2_f32_ir(float %val) {
 ; CHECK-NEXT: ld.param.f32 %f1, [st_param_v2_f32_ir_param_0];
 ; CHECK-NEXT: { // callseq 18, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.f32 [param0+0], {0f3F800000, %f1};
+; CHECK-NEXT: st.param.v2.f32 [param0], {0f3F800000, %f1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_f32,
 ; CHECK-NEXT: (
@@ -431,7 +431,7 @@ define void @st_param_v2_f32_ri(float %val) {
 ; CHECK-NEXT: ld.param.f32 %f1, [st_param_v2_f32_ri_param_0];
 ; CHECK-NEXT: { // callseq 19, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.f32 [param0+0], {%f1, 0f40000000};
+; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, 0f40000000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_f32,
 ; CHECK-NEXT: (
@@ -453,7 +453,7 @@ define void @st_param_v2_f64_ii(double %val) {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 20, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v2.f64 [param0+0], {0d3FF0000000000000, 0d4000000000000000};
+; CHECK-NEXT: st.param.v2.f64 [param0], {0d3FF0000000000000, 0d4000000000000000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_f64,
 ; CHECK-NEXT: (
@@ -473,7 +473,7 @@ define void @st_param_v2_f64_ir(double %val) {
 ; CHECK-NEXT: ld.param.f64 %fd1, [st_param_v2_f64_ir_param_0];
 ; CHECK-NEXT: { // callseq 21, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v2.f64 [param0+0], {0d3FF0000000000000, %fd1};
+; CHECK-NEXT: st.param.v2.f64 [param0], {0d3FF0000000000000, %fd1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_f64,
 ; CHECK-NEXT: (
@@ -495,7 +495,7 @@ define void @st_param_v2_f64_ri(double %val) {
 ; CHECK-NEXT: ld.param.f64 %fd1, [st_param_v2_f64_ri_param_0];
 ; CHECK-NEXT: { // callseq 22, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v2.f64 [param0+0], {%fd1, 0d4000000000000000};
+; CHECK-NEXT: st.param.v2.f64 [param0], {%fd1, 0d4000000000000000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v2_f64,
 ; CHECK-NEXT: (
@@ -524,7 +524,7 @@ define void @st_param_v4_i8_iiii() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 23, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {1, 2, 3, 4};
+; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -546,7 +546,7 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
 ; CHECK-NEXT: ld.param.u8 %rs3, [st_param_v4_i8_irrr_param_2];
 ; CHECK-NEXT: { // callseq 24, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {1, %rs1, %rs2, %rs3};
+; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, %rs3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -572,7 +572,7 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
 ; CHECK-NEXT: ld.param.u8 %rs3, [st_param_v4_i8_rirr_param_2];
 ; CHECK-NEXT: { // callseq 25, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {%rs1, 2, %rs2, %rs3};
+; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, %rs3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -598,7 +598,7 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
 ; CHECK-NEXT: ld.param.u8 %rs3, [st_param_v4_i8_rrir_param_2];
 ; CHECK-NEXT: { // callseq 26, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {%rs1, %rs2, 3, %rs3};
+; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, %rs3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -624,7 +624,7 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
 ; CHECK-NEXT: ld.param.u8 %rs3, [st_param_v4_i8_rrri_param_2];
 ; CHECK-NEXT: { // callseq 27, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {%rs1, %rs2, %rs3, 4};
+; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, %rs3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -649,7 +649,7 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
 ; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_iirr_param_1];
 ; CHECK-NEXT: { // callseq 28, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {1, 2, %rs1, %rs2};
+; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, %rs2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -674,7 +674,7 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
 ; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_irir_param_1];
 ; CHECK-NEXT: { // callseq 29, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {1, %rs1, 3, %rs2};
+; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, %rs2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -699,7 +699,7 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
 ; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_irri_param_1];
 ; CHECK-NEXT: { // callseq 30, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {1, %rs1, %rs2, 4};
+; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -724,7 +724,7 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
 ; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_riir_param_1];
 ; CHECK-NEXT: { // callseq 31, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {%rs1, 2, 3, %rs2};
+; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, %rs2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -749,7 +749,7 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
 ; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_riri_param_1];
 ; CHECK-NEXT: { // callseq 32, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {%rs1, 2, %rs2, 4};
+; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -774,7 +774,7 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
 ; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_rrii_param_1];
 ; CHECK-NEXT: { // callseq 33, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {%rs1, %rs2, 3, 4};
+; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -798,7 +798,7 @@ define void @st_param_v4_i8_iiir(i8 %d) {
 ; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_iiir_param_0];
 ; CHECK-NEXT: { // callseq 34, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {1, 2, 3, %rs1};
+; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, %rs1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -822,7 +822,7 @@ define void @st_param_v4_i8_iiri(i8 %c) {
 ; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_iiri_param_0];
 ; CHECK-NEXT: { // callseq 35, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {1, 2, %rs1, 4};
+; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -846,7 +846,7 @@ define void @st_param_v4_i8_irii(i8 %b) {
 ; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_irii_param_0];
 ; CHECK-NEXT: { // callseq 36, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {1, %rs1, 3, 4};
+; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -870,7 +870,7 @@ define void @st_param_v4_i8_riii(i8 %a) {
 ; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_riii_param_0];
 ; CHECK-NEXT: { // callseq 37, 0
 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0+0], {%rs1, 2, 3, 4};
+; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i8,
 ; CHECK-NEXT: (
@@ -894,7 +894,7 @@ define void @st_param_v4_i16_iiii() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 38, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {1, 2, 3, 4};
+; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -916,7 +916,7 @@ define void @st_param_v4_i16_irrr(i16 %b, i16 %c, i16 %d) {
 ; CHECK-NEXT: ld.param.u16 %rs3, [st_param_v4_i16_irrr_param_2];
 ; CHECK-NEXT: { // callseq 39, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {1, %rs1, %rs2, %rs3};
+; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, %rs2, %rs3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -942,7 +942,7 @@ define void @st_param_v4_i16_rirr(i16 %a, i16 %c, i16 %d) {
 ; CHECK-NEXT: ld.param.u16 %rs3, [st_param_v4_i16_rirr_param_2];
 ; CHECK-NEXT: { // callseq 40, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {%rs1, 2, %rs2, %rs3};
+; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, %rs2, %rs3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -968,7 +968,7 @@ define void @st_param_v4_i16_rrir(i16 %a, i16 %b, i16 %d) {
 ; CHECK-NEXT: ld.param.u16 %rs3, [st_param_v4_i16_rrir_param_2];
 ; CHECK-NEXT: { // callseq 41, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {%rs1, %rs2, 3, %rs3};
+; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, %rs3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -994,7 +994,7 @@ define void @st_param_v4_i16_rrri(i16 %a, i16 %b, i16 %c) {
 ; CHECK-NEXT: ld.param.u16 %rs3, [st_param_v4_i16_rrri_param_2];
 ; CHECK-NEXT: { // callseq 42, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {%rs1, %rs2, %rs3, 4};
+; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, %rs3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1019,7 +1019,7 @@ define void @st_param_v4_i16_iirr(i16 %c, i16 %d) {
 ; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_iirr_param_1];
 ; CHECK-NEXT: { // callseq 43, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {1, 2, %rs1, %rs2};
+; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, %rs2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1044,7 +1044,7 @@ define void @st_param_v4_i16_irir(i16 %b, i16 %d) {
 ; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_irir_param_1];
 ; CHECK-NEXT: { // callseq 44, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {1, %rs1, 3, %rs2};
+; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, %rs2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1069,7 +1069,7 @@ define void @st_param_v4_i16_irri(i16 %b, i16 %c) {
 ; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_irri_param_1];
 ; CHECK-NEXT: { // callseq 45, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {1, %rs1, %rs2, 4};
+; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, %rs2, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1094,7 +1094,7 @@ define void @st_param_v4_i16_riir(i16 %a, i16 %d) {
 ; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_riir_param_1];
 ; CHECK-NEXT: { // callseq 46, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {%rs1, 2, 3, %rs2};
+; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, %rs2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1119,7 +1119,7 @@ define void @st_param_v4_i16_riri(i16 %a, i16 %c) {
 ; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_riri_param_1];
 ; CHECK-NEXT: { // callseq 47, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {%rs1, 2, %rs2, 4};
+; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, %rs2, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1144,7 +1144,7 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) {
 ; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_rrii_param_1];
 ; CHECK-NEXT: { // callseq 48, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {%rs1, %rs2, 3, 4};
+; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1168,7 +1168,7 @@ define void @st_param_v4_i16_iiir(i16 %d) {
 ; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_iiir_param_0];
 ; CHECK-NEXT: { // callseq 49, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {1, 2, 3, %rs1};
+; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, %rs1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1192,7 +1192,7 @@ define void @st_param_v4_i16_iiri(i16 %c) {
 ; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_iiri_param_0];
 ; CHECK-NEXT: { // callseq 50, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {1, 2, %rs1, 4};
+; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1216,7 +1216,7 @@ define void @st_param_v4_i16_irii(i16 %b) {
 ; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_irii_param_0];
 ; CHECK-NEXT: { // callseq 51, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {1, %rs1, 3, 4};
+; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1240,7 +1240,7 @@ define void @st_param_v4_i16_riii(i16 %a) {
 ; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_riii_param_0];
 ; CHECK-NEXT: { // callseq 52, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0+0], {%rs1, 2, 3, 4};
+; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i16,
 ; CHECK-NEXT: (
@@ -1264,7 +1264,7 @@ define void @st_param_v4_i32_iiii() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 53, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {1, 2, 3, 4};
+; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1286,7 +1286,7 @@ define void @st_param_v4_i32_irrr(i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT: ld.param.u32 %r3, [st_param_v4_i32_irrr_param_2];
 ; CHECK-NEXT: { // callseq 54, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {1, %r1, %r2, %r3};
+; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, %r2, %r3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1312,7 +1312,7 @@ define void @st_param_v4_i32_rirr(i32 %a, i32 %c, i32 %d) {
 ; CHECK-NEXT: ld.param.u32 %r3, [st_param_v4_i32_rirr_param_2];
 ; CHECK-NEXT: { // callseq 55, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {%r1, 2, %r2, %r3};
+; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, %r2, %r3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1338,7 +1338,7 @@ define void @st_param_v4_i32_rrir(i32 %a, i32 %b, i32 %d) {
 ; CHECK-NEXT: ld.param.u32 %r3, [st_param_v4_i32_rrir_param_2];
 ; CHECK-NEXT: { // callseq 56, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {%r1, %r2, 3, %r3};
+; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 3, %r3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1364,7 +1364,7 @@ define void @st_param_v4_i32_rrri(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT: ld.param.u32 %r3, [st_param_v4_i32_rrri_param_2];
 ; CHECK-NEXT: { // callseq 57, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {%r1, %r2, %r3, 4};
+; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, %r3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1389,7 +1389,7 @@ define void @st_param_v4_i32_iirr(i32 %c, i32 %d) {
 ; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_iirr_param_1];
 ; CHECK-NEXT: { // callseq 58, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {1, 2, %r1, %r2};
+; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, %r1, %r2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1414,7 +1414,7 @@ define void @st_param_v4_i32_irir(i32 %b, i32 %d) {
 ; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_irir_param_1];
 ; CHECK-NEXT: { // callseq 59, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {1, %r1, 3, %r2};
+; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, 3, %r2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1439,7 +1439,7 @@ define void @st_param_v4_i32_irri(i32 %b, i32 %c) {
 ; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_irri_param_1];
 ; CHECK-NEXT: { // callseq 60, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {1, %r1, %r2, 4};
+; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, %r2, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1464,7 +1464,7 @@ define void @st_param_v4_i32_riir(i32 %a, i32 %d) {
 ; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_riir_param_1];
 ; CHECK-NEXT: { // callseq 61, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {%r1, 2, 3, %r2};
+; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, 3, %r2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1489,7 +1489,7 @@ define void @st_param_v4_i32_riri(i32 %a, i32 %c) {
 ; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_riri_param_1];
 ; CHECK-NEXT: { // callseq 62, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {%r1, 2, %r2, 4};
+; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, %r2, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1514,7 +1514,7 @@ define void @st_param_v4_i32_rrii(i32 %a, i32 %b) {
 ; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_rrii_param_1];
 ; CHECK-NEXT: { // callseq 63, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {%r1, %r2, 3, 4};
+; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1538,7 +1538,7 @@ define void @st_param_v4_i32_iiir(i32 %d) {
 ; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_iiir_param_0];
 ; CHECK-NEXT: { // callseq 64, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {1, 2, 3, %r1};
+; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, 3, %r1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1562,7 +1562,7 @@ define void @st_param_v4_i32_iiri(i32 %c) {
 ; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_iiri_param_0];
 ; CHECK-NEXT: { // callseq 65, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {1, 2, %r1, 4};
+; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, %r1, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1586,7 +1586,7 @@ define void @st_param_v4_i32_irii(i32 %b) {
 ; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_irii_param_0];
 ; CHECK-NEXT: { // callseq 66, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {1, %r1, 3, 4};
+; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1610,7 +1610,7 @@ define void @st_param_v4_i32_riii(i32 %a) {
 ; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_riii_param_0];
 ; CHECK-NEXT: { // callseq 67, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.b32 [param0+0], {%r1, 2, 3, 4};
+; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, 3, 4};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_i32,
 ; CHECK-NEXT: (
@@ -1634,7 +1634,7 @@ define void @st_param_v4_f32_iiii() {
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: { // callseq 68, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000};
+; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1656,7 +1656,7 @@ define void @st_param_v4_f32_irrr(float %b, float %c, float %d) {
 ; CHECK-NEXT: ld.param.f32 %f3, [st_param_v4_f32_irrr_param_2];
 ; CHECK-NEXT: { // callseq 69, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {0f3F800000, %f1, %f2, %f3};
+; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, %f1, %f2, %f3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1682,7 +1682,7 @@ define void @st_param_v4_f32_rirr(float %a, float %c, float %d) {
 ; CHECK-NEXT: ld.param.f32 %f3, [st_param_v4_f32_rirr_param_2];
 ; CHECK-NEXT: { // callseq 70, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {%f1, 0f40000000, %f2, %f3};
+; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, 0f40000000, %f2, %f3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1708,7 +1708,7 @@ define void @st_param_v4_f32_rrir(float %a, float %b, float %d) {
 ; CHECK-NEXT: ld.param.f32 %f3, [st_param_v4_f32_rrir_param_2];
 ; CHECK-NEXT: { // callseq 71, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {%f1, %f2, 0f40400000, %f3};
+; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, %f2, 0f40400000, %f3};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1734,7 +1734,7 @@ define void @st_param_v4_f32_rrri(float %a, float %b, float %c) {
 ; CHECK-NEXT: ld.param.f32 %f3, [st_param_v4_f32_rrri_param_2];
 ; CHECK-NEXT: { // callseq 72, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {%f1, %f2, %f3, 0f40800000};
+; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, %f2, %f3, 0f40800000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1759,7 +1759,7 @@ define void @st_param_v4_f32_iirr(float %c, float %d) {
 ; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_iirr_param_1];
 ; CHECK-NEXT: { // callseq 73, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, %f1, %f2};
+; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, 0f40000000, %f1, %f2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1784,7 +1784,7 @@ define void @st_param_v4_f32_irir(float %b, float %d) {
 ; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_irir_param_1];
 ; CHECK-NEXT: { // callseq 74, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {0f3F800000, %f1, 0f40400000, %f2};
+; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, %f1, 0f40400000, %f2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1809,7 +1809,7 @@ define void @st_param_v4_f32_irri(float %b, float %c) {
 ; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_irri_param_1];
 ; CHECK-NEXT: { // callseq 75, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {0f3F800000, %f1, %f2, 0f40800000};
+; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, %f1, %f2, 0f40800000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1834,7 +1834,7 @@ define void @st_param_v4_f32_riir(float %a, float %d) {
 ; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_riir_param_1];
 ; CHECK-NEXT: { // callseq 76, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {%f1, 0f40000000, 0f40400000, %f2};
+; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, 0f40000000, 0f40400000, %f2};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1859,7 +1859,7 @@ define void @st_param_v4_f32_riri(float %a, float %c) {
 ; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_riri_param_1];
 ; CHECK-NEXT: { // callseq 77, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {%f1, 0f40000000, %f2, 0f40800000};
+; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, 0f40000000, %f2, 0f40800000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1884,7 +1884,7 @@ define void @st_param_v4_f32_rrii(float %a, float %b) {
 ; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_rrii_param_1];
 ; CHECK-NEXT: { // callseq 78, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {%f1, %f2, 0f40400000, 0f40800000};
+; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, %f2, 0f40400000, 0f40800000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1908,7 +1908,7 @@ define void @st_param_v4_f32_iiir(float %d) {
 ; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_iiir_param_0];
 ; CHECK-NEXT: { // callseq 79, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, 0f40400000, %f1};
+; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, 0f40000000, 0f40400000, %f1};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1932,7 +1932,7 @@ define void @st_param_v4_f32_iiri(float %c) {
 ; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_iiri_param_0];
 ; CHECK-NEXT: { // callseq 80, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {0f3F800000, 0f40000000, %f1, 0f40800000};
+; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, 0f40000000, %f1, 0f40800000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1956,7 +1956,7 @@ define void @st_param_v4_f32_irii(float %b) {
 ; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_irii_param_0];
 ; CHECK-NEXT: { // callseq 81, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {0f3F800000, %f1, 0f40400000, 0f40800000};
+; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, %f1, 0f40400000, 0f40800000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
@@ -1980,7 +1980,7 @@ define void @st_param_v4_f32_riii(float %a) {
 ; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_riii_param_0];
 ; CHECK-NEXT: { // callseq 82, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v4.f32 [param0+0], {%f1, 0f40000000, 0f40400000, 0f40800000};
+; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, 0f40000000, 0f40400000, 0f40800000};
 ; CHECK-NEXT: call.uni
 ; CHECK-NEXT: call_v4_f32,
 ; CHECK-NEXT: (
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index 109d28a3e3c5..1b991ab82db8 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -38,7 +38,7 @@ define void @test_store_param_def(i64 %param0, i32 %param1) {
 ; CHECK-NEXT: ld.param.u32 %r1, [test_store_param_def_param_1];
 ; CHECK-NEXT: { // callseq 1, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[32];
-; CHECK-NEXT: st.param.b64 [param0+0], %rd1;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
 ; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r2, %r1};
 ; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r3, %r1, %r4, %r5};
 ; CHECK-NEXT: call.uni
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index 107671d1d1f3..473bc28ed4ee 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -58,8 +58,8 @@ define void @baz(ptr %red, i32 %idx) {
 ; SM20: texfunc,
 ; SM30: texfunc,
 %texcall = tail call float @texfunc(i64 %texHandle)
-; SM20: ld.param.f32 %f[[TEXCALL:[0-9]+]], [[[RETVAL]]+0]
-; SM30: ld.param.f32 %f[[TEXCALL:[0-9]+]], [[[RETVAL]]+0]
+; SM20: ld.param.f32 %f[[TEXCALL:[0-9]+]], [[[RETVAL]]]
+; SM30: ld.param.f32 %f[[TEXCALL:[0-9]+]], [[[RETVAL]]]
 ; SM20: add.rn.f32 %f[[RET2:[0-9]+]], %f[[RED]], %f[[TEXCALL]]
 ; SM30: add.rn.f32 %f[[RET2:[0-9]+]], %f[[RED]], %f[[TEXCALL]]
 %ret2 = fadd float %ret, %texcall
diff --git a/llvm/test/CodeGen/NVPTX/tid-range.ll b/llvm/test/CodeGen/NVPTX/tid-range.ll
index c4dd33960d44..4af4cc384535 100644
--- a/llvm/test/CodeGen/NVPTX/tid-range.ll
+++ b/llvm/test/CodeGen/NVPTX/tid-range.ll
@@ -13,7 +13,7 @@ entry:
 ; CHECK-LABEL: test1(
 ; CHECK: setp.eq.s32 %p1, %r1, 1;
 ; CHECK: selp.u32 %[[R:.+]], 1, 0, %p1;
-; CHECK: st.param.b32 [func_retval0+0], %[[R]];
+; CHECK: st.param.b32 [func_retval0], %[[R]];
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index 40a3e9e945a2..7dd751cab630 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -29,7 +29,7 @@
 ; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
 ; CHECK: { // callseq
 ; CHECK: .param .align 8 .b8 param0[16];
-; CHECK-DAG: st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b16 [param0], [[P0]];
 ; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]];
 ; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]];
 ; CHECK: .param .align 8 .b8 retval0[16];
@@ -38,11 +38,11 @@
 ; CHECK-NEXT: (
 ; CHECK-NEXT: param0
 ; CHECK-NEXT: );
-; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+3];
 ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+4];
 ; CHECK: } // callseq
-; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b16 [func_retval0], [[R0]];
 ; CHECK-DAG: shl.b16 [[R2_1_shl:%rs[0-9]+]], [[R2_1]], 8;
 ; CHECK-DAG: and.b16 [[R2_0_and:%rs[0-9]+]], [[R2_0]], 255;
 ; CHECK-DAG: or.b16 [[R2:%rs[0-9]+]], [[R2_0_and]], [[R2_1_shl]];
@@ -74,7 +74,7 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 ; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
 ; CHECK: { // callseq
 ; CHECK-DAG: .param .align 8 .b8 param0[24];
-; CHECK-DAG: st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b32 [param0], [[P0]];
 ; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
 ; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
 ; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
@@ -85,13 +85,13 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 ; CHECK-NEXT: (
 ; CHECK-NEXT: param0
 ; CHECK-NEXT: );
-; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
 ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
 ; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
 ; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
 ; CHECK: } // callseq
-; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b32 [func_retval0], [[R0]];
 ; CHECK-DAG: st.param.b8 [func_retval0+5],
 ; CHECK-DAG: st.param.b8 [func_retval0+6],
 ; CHECK-DAG: st.param.b8 [func_retval0+7],
@@ -137,7 +137,7 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
 ; CHECK: { // callseq
 ; CHECK: .param .align 8 .b8 param0[32];
-; CHECK-DAG: st.param.b64 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b64 [param0], [[P0]];
 ; CHECK-DAG: st.param.b8 [param0+9], [[P2]];
 ; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]];
 ; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]];
@@ -152,7 +152,7 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK-NEXT: (
 ; CHECK-NEXT: param0
 ; CHECK-NEXT: );
-; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9];
 ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10];
 ; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11];
@@ -162,7 +162,7 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15];
 ; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16];
 ; CHECK: } // callseq
-; CHECK-DAG: st.param.b64 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b64 [func_retval0], [[R0]];
 ; CHECK-DAG: st.param.b8 [func_retval0+9],
 ; CHECK-DAG: st.param.b8 [func_retval0+10],
 ; CHECK-DAG: st.param.b8 [func_retval0+11],
@@ -188,7 +188,7 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 ; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
 ; CHECK: { // callseq
 ; CHECK: .param .align 8 .b8 param0[16];
-; CHECK-DAG: st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b16 [param0], [[P0]];
 ; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]];
 ; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]];
 ; CHECK: .param .align 8 .b8 retval0[16];
@@ -197,11 +197,11 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 ; CHECK-NEXT: (
 ; CHECK-NEXT: param0
 ; CHECK-NEXT: );
-; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[R2I_0:%rs[0-9]+]], [retval0+3];
 ; CHECK-DAG: ld.param.b8 [[R2I_1:%rs[0-9]+]], [retval0+4];
 ; CHECK: } // callseq
-; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b16 [func_retval0], [[R0]];
 ; CHECK-DAG: shl.b16 [[R2I_1_shl:%rs[0-9]+]], [[R2I_1]], 8;
 ; CHECK-DAG: and.b16 [[R2I_0_and:%rs[0-9]+]], [[R2I_0]], 255;
 ; CHECK-DAG: or.b16 [[R2I:%rs[0-9]+]], [[R2I_0_and]], [[R2I_1_shl]];
@@ -233,7 +233,7 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 ; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
 ; CHECK: { // callseq
 ; CHECK-DAG: .param .align 8 .b8 param0[24];
-; CHECK-DAG: st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b32 [param0], [[P0]];
 ; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
 ; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
 ; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
@@ -244,13 +244,13 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 ; CHECK-NEXT: (
 ; CHECK-NEXT: param0
 ; CHECK-NEXT: );
-; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
 ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
 ; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
 ; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
 ; CHECK: } // callseq
-; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b32 [func_retval0], [[R0]];
 ; CHECK-DAG: st.param.b8 [func_retval0+5],
 ; CHECK-DAG: st.param.b8 [func_retval0+6],
 ; CHECK-DAG: st.param.b8 [func_retval0+7],
@@ -280,7 +280,7 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
 ; CHECK: { // callseq
 ; CHECK-DAG: .param .align 8 .b8 param0[24];
-; CHECK-DAG: st.param.f32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.f32 [param0], [[P0]];
 ; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
 ; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
 ; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
@@ -291,13 +291,13 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-NEXT: (
 ; CHECK-NEXT: param0
 ; CHECK-NEXT: );
-; CHECK-DAG: ld.param.f32 [[R0:%f[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.f32 [[R0:%f[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
 ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
 ; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
 ; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
 ; CHECK: } // callseq
-; CHECK-DAG: st.param.f32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.f32 [func_retval0], [[R0]];
 ; CHECK-DAG: st.param.b8 [func_retval0+5],
 ; CHECK-DAG: st.param.b8 [func_retval0+6],
 ; CHECK-DAG: st.param.b8 [func_retval0+7],
@@ -343,7 +343,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
 ; CHECK: { // callseq
 ; CHECK: .param .align 8 .b8 param0[32];
-; CHECK-DAG: st.param.f64 [param0+0], [[P0]];
+; CHECK-DAG: st.param.f64 [param0], [[P0]];
 ; CHECK-DAG: st.param.b8 [param0+9], [[P2]];
 ; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]];
 ; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]];
@@ -358,7 +358,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-NEXT: (
 ; CHECK-NEXT: param0
 ; CHECK-NEXT: );
-; CHECK-DAG: ld.param.f64 [[R0:%fd[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.f64 [[R0:%fd[0-9]+]], [retval0];
 ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9];
 ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10];
 ; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11];
@@ -368,7 +368,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15];
 ; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16];
 ; CHECK: } // callseq
-; CHECK-DAG: st.param.f64 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.f64 [func_retval0], [[R0]];
 ; CHECK-DAG: st.param.b8 [func_retval0+9],
 ; CHECK-DAG: st.param.b8 [func_retval0+10],
 ; CHECK-DAG: st.param.b8 [func_retval0+11],
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 8633b09af048..044d21643ed9 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -18,7 +18,7 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-NEXT: not.b16 %rs5, %rs2;
 ; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5;
 ; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6;
-; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs7;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs7;
 ; CHECK-NEXT: ret;
 %mx = and <1 x i8> %x, %mask
 %notmask = xor <1 x i8> %mask,
@@ -44,7 +44,7 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin
 ; CHECK-NEXT: not.b16 %rs5, %rs2;
 ; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5;
 ; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6;
-; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs7;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
 ; CHECK-NEXT: ret;
 %mx = and <1 x i16> %x, %mask
 %notmask = xor <1 x i16> %mask,
@@ -70,7 +70,7 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-NEXT: xor.b32 %r7, %r1, -1;
 ; CHECK-NEXT: and.b32 %r8, %r3, %r7;
 ; CHECK-NEXT: or.b32 %r9, %r5, %r8;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT: ret;
 %mx = and <4 x i8> %x, %mask
 %notmask = xor <4 x i8> %mask,
@@ -92,7 +92,7 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-NEXT: xor.b32 %r7, %r1, -16711681;
 ; CHECK-NEXT: and.b32 %r8, %r3, %r7;
 ; CHECK-NEXT: or.b32 %r9, %r5, %r8;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT: ret;
 %mx = and <4 x i8> %x, %mask
 %notmask = xor <4 x i8> %mask,
@@ -114,7 +114,7 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
 ; CHECK-NEXT: xor.b32 %r7, %r1, -1;
 ; CHECK-NEXT: and.b32 %r8, %r3, %r7;
 ; CHECK-NEXT: or.b32 %r9, %r5, %r8;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT: ret;
 %mx = and <2 x i16> %x, %mask
 %notmask = xor <2 x i16> %mask,
@@ -136,7 +136,7 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin
 ; CHECK-NEXT: not.b32 %r5, %r2;
 ; CHECK-NEXT: and.b32 %r6, %r4, %r5;
 ; CHECK-NEXT: or.b32 %r7, %r3, %r6;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
 ; CHECK-NEXT: ret;
 %mx = and <1 x i32> %x, %mask
 %notmask = xor <1 x i32> %mask,
@@ -166,7 +166,7 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-NEXT: and.b32 %r18, %r2, %r15;
 ; CHECK-NEXT: or.b32 %r19, %r13, %r18;
 ; CHECK-NEXT: or.b32 %r20, %r11, %r17;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r20, %r19};
 ; CHECK-NEXT: ret;
 %mx = and <8 x i8> %x, %mask
 %notmask = xor <8 x i8> %mask,
@@ -192,7 +192,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-NEXT: and.b32 %r18, %r2, %r15;
 ; CHECK-NEXT: or.b32 %r19, %r13, %r18;
 ; CHECK-NEXT: or.b32 %r20, %r11, %r17;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r20, %r19};
 ; CHECK-NEXT: ret;
 %mx = and <4 x i16> %x, %mask
 %notmask = xor <4 x i16> %mask,
@@ -218,7 +218,7 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 ; CHECK-NEXT: and.b32 %r18, %r2, %r15;
 ; CHECK-NEXT: or.b32 %r19, %r13, %r18;
 ; CHECK-NEXT: or.b32 %r20, %r11, %r17;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r20, %r19};
 ; CHECK-NEXT: ret;
 %mx = and <4 x i16> %x, %mask
 %notmask = xor <4 x i16> %mask,
@@ -244,7 +244,7 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin
 ; CHECK-NEXT: and.b32 %r12, %r8, %r9;
 ; CHECK-NEXT: or.b32 %r13, %r6, %r12;
 ; CHECK-NEXT: or.b32 %r14, %r5, %r11;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r14, %r13};
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r13};
 ; CHECK-NEXT: ret;
 %mx = and <2 x i32> %x, %mask
 %notmask = xor <2 x i32> %mask,
@@ -266,7 +266,7 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin
 ; CHECK-NEXT: not.b64 %rd5, %rd2;
 ; CHECK-NEXT: and.b64 %rd6, %rd4, %rd5;
 ; CHECK-NEXT: or.b64 %rd7, %rd3, %rd6;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd7;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd7;
 ; CHECK-NEXT: ret;
 %mx = and <1 x i64> %x, %mask
 %notmask = xor <1 x i64> %mask,
@@ -304,7 +304,7 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
 ; CHECK-NEXT: or.b32 %r38, %r25, %r35;
 ; CHECK-NEXT: or.b32 %r39, %r23, %r34;
 ; CHECK-NEXT: or.b32 %r40, %r21, %r33;
-; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r40, %r39, %r38, %r37};
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r39, %r38, %r37};
 ; CHECK-NEXT: ret;
 %mx = and <16 x i8> %x, %mask
 %notmask = xor
<16 x i8> %mask, @@ -338,7 +338,7 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-NEXT: or.b32 %r38, %r25, %r35; ; CHECK-NEXT: or.b32 %r39, %r23, %r34; ; CHECK-NEXT: or.b32 %r40, %r21, %r33; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r40, %r39, %r38, %r37}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r39, %r38, %r37}; ; CHECK-NEXT: ret; %mx = and <8 x i16> %x, %mask %notmask = xor <8 x i16> %mask, @@ -372,7 +372,7 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin ; CHECK-NEXT: or.b32 %r26, %r11, %r23; ; CHECK-NEXT: or.b32 %r27, %r10, %r22; ; CHECK-NEXT: or.b32 %r28, %r9, %r21; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r28, %r27, %r26, %r25}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25}; ; CHECK-NEXT: ret; %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -403,7 +403,7 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n ; CHECK-NEXT: or.b32 %r23, %r12, %r22; ; CHECK-NEXT: or.b32 %r24, %r11, %r21; ; CHECK-NEXT: or.b32 %r25, %r10, %r20; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r25, %r24, %r9, %r23}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r25, %r24, %r9, %r23}; ; CHECK-NEXT: ret; %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -429,7 +429,7 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwin ; CHECK-NEXT: and.b64 %rd12, %rd8, %rd9; ; CHECK-NEXT: or.b64 %rd13, %rd6, %rd12; ; CHECK-NEXT: or.b64 %rd14, %rd5, %rd11; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd14, %rd13}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd14, %rd13}; ; CHECK-NEXT: ret; %mx = and <2 x i64> %x, %mask %notmask = xor <2 x i64> %mask, @@ -458,7 +458,7 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-NEXT: ld.param.u8 %rs4, [in_v1i8_param_2]; ; CHECK-NEXT: and.b16 %rs5, %rs3, %rs4; ; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs6; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs6; ; CHECK-NEXT: ret; %n0 = xor <1 x i8> %x, %y %n1 = and <1 x i8> %n0, %mask @@ -482,7 +482,7 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind ; CHECK-NEXT: ld.param.u16 %rs4, [in_v1i16_param_2]; ; CHECK-NEXT: and.b16 %rs5, %rs3, %rs4; ; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2; -; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs6; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-NEXT: ret; %n0 = xor <1 x i16> %x, %y %n1 = and <1 x i16> %n0, %mask @@ -506,7 +506,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-NEXT: ld.param.u32 %r4, [in_v4i8_param_2]; ; CHECK-NEXT: and.b32 %r5, %r3, %r4; ; CHECK-NEXT: xor.b32 %r6, %r5, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %n0 = xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -526,7 +526,7 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind ; CHECK-NEXT: ld.param.u32 %r4, [in_v2i16_param_2]; ; CHECK-NEXT: and.b32 %r5, %r3, %r4; ; CHECK-NEXT: xor.b32 %r6, %r5, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %n0 = xor <2 x i16> %x, %y %n1 = and <2 x i16> %n0, %mask @@ -546,7 +546,7 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind ; CHECK-NEXT: ld.param.u32 %r4, 
[in_v1i32_param_2]; ; CHECK-NEXT: and.b32 %r5, %r3, %r4; ; CHECK-NEXT: xor.b32 %r6, %r5, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %n0 = xor <1 x i32> %x, %y %n1 = and <1 x i32> %n0, %mask @@ -573,7 +573,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-NEXT: xor.b32 %r11, %r1, %r3; ; CHECK-NEXT: and.b32 %r12, %r11, %r5; ; CHECK-NEXT: xor.b32 %r13, %r12, %r3; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r13, %r9}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r13, %r9}; ; CHECK-NEXT: ret; %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -596,7 +596,7 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind ; CHECK-NEXT: xor.b32 %r11, %r1, %r3; ; CHECK-NEXT: and.b32 %r12, %r11, %r5; ; CHECK-NEXT: xor.b32 %r13, %r12, %r3; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r13, %r9}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r13, %r9}; ; CHECK-NEXT: ret; %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ -619,7 +619,7 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind ; CHECK-NEXT: and.b32 %r10, %r5, %r8; ; CHECK-NEXT: xor.b32 %r11, %r10, %r4; ; CHECK-NEXT: xor.b32 %r12, %r9, %r3; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r12, %r11}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11}; ; CHECK-NEXT: ret; %n0 = xor <2 x i32> %x, %y %n1 = and <2 x i32> %n0, %mask @@ -639,7 +639,7 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind ; CHECK-NEXT: ld.param.u64 %rd4, [in_v1i64_param_2]; ; CHECK-NEXT: and.b64 %rd5, %rd3, %rd4; ; CHECK-NEXT: xor.b64 %rd6, %rd5, %rd2; -; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd6; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; ; CHECK-NEXT: ret; %n0 = xor <1 x i64> %x, %y %n1 = and <1 x i64> %n0, %mask @@ -672,7 +672,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind ; CHECK-NEXT: xor.b32 %r23, %r19, %r7; ; CHECK-NEXT: xor.b32 %r25, %r18, %r6; ; CHECK-NEXT: xor.b32 %r27, %r17, %r5; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r27, %r25, %r23, %r21}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r27, %r25, %r23, %r21}; ; CHECK-NEXT: ret; %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -701,7 +701,7 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind ; CHECK-NEXT: xor.b32 %r23, %r19, %r7; ; CHECK-NEXT: xor.b32 %r25, %r18, %r6; ; CHECK-NEXT: xor.b32 %r27, %r17, %r5; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r27, %r25, %r23, %r21}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r27, %r25, %r23, %r21}; ; CHECK-NEXT: ret; %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask @@ -730,7 +730,7 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind ; CHECK-NEXT: xor.b32 %r22, %r19, %r7; ; CHECK-NEXT: xor.b32 %r23, %r18, %r6; ; CHECK-NEXT: xor.b32 %r24, %r17, %r5; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r24, %r23, %r22, %r21}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r24, %r23, %r22, %r21}; ; CHECK-NEXT: ret; %n0 = xor <4 x i32> %x, %y %n1 = and <4 x i32> %n0, %mask @@ -753,7 +753,7 @@ define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind ; CHECK-NEXT: and.b64 %rd10, %rd5, %rd8; ; CHECK-NEXT: xor.b64 %rd11, %rd10, %rd4; ; CHECK-NEXT: xor.b64 %rd12, %rd9, %rd3; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd12, %rd11}; +; 
CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd12, %rd11}; ; CHECK-NEXT: ret; %n0 = xor <2 x i64> %x, %y %n1 = and <2 x i64> %n0, %mask diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll index b8c213de04f8..8ecdff9d65ac 100644 --- a/llvm/test/CodeGen/NVPTX/vaargs.ll +++ b/llvm/test/CodeGen/NVPTX/vaargs.ll @@ -17,55 +17,55 @@ entry: ; Test va_start ; CHECK: .param .align 8 .b8 foo_vararg[] ; CHECK: mov.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], foo_vararg; -; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR]]; +; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR]]; call void @llvm.va_start(ptr %al) ; Test va_copy() -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0]; +; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK-NEXT: st.u[[BITS]] [%SP+{{[0-9]+}}], [[VA_PTR]]; call void @llvm.va_copy(ptr %al2, ptr %al) ; Test va_arg(ap, int32_t) -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0]; +; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 3; ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -4; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 4; -; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR_NEXT]]; +; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR_NEXT]]; ; CHECK-NEXT: ld.local.u32 %r{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; %0 = va_arg ptr %al, i32 ; Test va_arg(ap, int64_t) -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0]; +; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 7; ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8; -; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR_NEXT]]; +; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR_NEXT]]; ; CHECK-NEXT: ld.local.u64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; %1 = va_arg ptr %al, i64 ; Test va_arg(ap, double) -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0]; +; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 7; ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8; -; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR_NEXT]]; +; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR_NEXT]]; ; CHECK-NEXT: ld.local.f64 %fd{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; %2 = va_arg ptr %al, double ; Test va_arg(ap, ptr) -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP+0]; +; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK32-NEXT: add.s32 [[VA_PTR_TMP:%r[0-9]+]], [[VA_PTR]], 3; ; CHECK64-NEXT: add.s64 [[VA_PTR_TMP:%rd[0-9]+]], [[VA_PTR]], 7; ; CHECK32-NEXT: and.b32 [[VA_PTR_ALIGN:%r[0-9]+]], [[VA_PTR_TMP]], -4; ; CHECK64-NEXT: and.b64 [[VA_PTR_ALIGN:%rd[0-9]+]], [[VA_PTR_TMP]], -8; ; CHECK32-NEXT: add.s32 [[VA_PTR_NEXT:%r[0-9]+]], [[VA_PTR_ALIGN]], 4; ; CHECK64-NEXT: add.s64 [[VA_PTR_NEXT:%rd[0-9]+]], [[VA_PTR_ALIGN]], 8; -; CHECK-NEXT: st.u[[BITS]] [%SP+0], [[VA_PTR_NEXT]]; +; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR_NEXT]]; ; CHECK-NEXT: ld.local.u[[BITS]] %{{(r|rd)[0-9]+}}, [[[VA_PTR_ALIGN]]]; %3 = va_arg ptr %al, ptr @@ -91,7 +91,7 @@ define i32 @test_foo(i32 %i, i64 %l, double %d, ptr %p) { ; Store arguments to an array ; CHECK32: .param .align 8 .b8 param1[24]; ; CHECK64: .param .align 8 .b8 param1[28]; -; 
CHECK-NEXT: st.param.b32 [param1+0], [[ARG_I32]]; +; CHECK-NEXT: st.param.b32 [param1], [[ARG_I32]]; ; CHECK-NEXT: st.param.b64 [param1+4], [[ARG_I64]]; ; CHECK-NEXT: st.param.f64 [param1+12], [[ARG_DOUBLE]]; ; CHECK-NEXT: st.param.b[[BITS]] [param1+20], [[ARG_VOID_PTR]]; diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 0e0c89d3e021..6d14986b7ff3 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -42,7 +42,7 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) { ; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd5, %r9; ; CHECK-PTX-NEXT: add.rn.f64 %fd6, %fd5, %fd4; ; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r10, %fd6; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r10; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 @@ -112,7 +112,7 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot1; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-PTX-NEXT: mov.u64 %rd1, 4294967297; -; CHECK-PTX-NEXT: st.u64 [%SP+0], %rd1; +; CHECK-PTX-NEXT: st.u64 [%SP], %rd1; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; ; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; ; CHECK-PTX-NEXT: mov.u64 %rd2, 1; @@ -123,9 +123,9 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 0; ; CHECK-PTX-NEXT: { // callseq 0, 0 ; CHECK-PTX-NEXT: .param .b32 param0; -; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: st.param.b32 [param0], 1; ; CHECK-PTX-NEXT: .param .b64 param1; -; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd4; +; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4; ; CHECK-PTX-NEXT: .param .b32 retval0; ; CHECK-PTX-NEXT: call.uni (retval0), ; CHECK-PTX-NEXT: variadics1, @@ -133,9 +133,9 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: param0, ; CHECK-PTX-NEXT: param1 ; CHECK-PTX-NEXT: ); -; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0]; ; CHECK-PTX-NEXT: } // callseq 0 -; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-PTX-NEXT: ret; entry: %conv = sext i8 1 to i32 @@ -174,14 +174,14 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) 
{ ; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd7]; ; CHECK-PTX-NEXT: shl.b16 %rs4, %rs3, 8; ; CHECK-PTX-NEXT: or.b16 %rs5, %rs4, %rs2; -; CHECK-PTX-NEXT: st.u16 [%SP+0], %rs5; +; CHECK-PTX-NEXT: st.u16 [%SP], %rs5; ; CHECK-PTX-NEXT: ld.u64 %rd8, [%rd3+8]; ; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2; ; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3; ; CHECK-PTX-NEXT: cvt.u64.u32 %rd9, %r5; ; CHECK-PTX-NEXT: add.s64 %rd10, %rd9, %rd8; ; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd10; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r6; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 @@ -237,7 +237,7 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5; ; CHECK-PTX-NEXT: shl.b16 %rs7, %rs6, 8; ; CHECK-PTX-NEXT: or.b16 %rs8, %rs7, %rs4; -; CHECK-PTX-NEXT: st.u16 [%SP+0], %rs8; +; CHECK-PTX-NEXT: st.u16 [%SP], %rs8; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; ; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; ; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 8; @@ -248,9 +248,9 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd7; ; CHECK-PTX-NEXT: { // callseq 1, 0 ; CHECK-PTX-NEXT: .param .b32 param0; -; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: st.param.b32 [param0], 1; ; CHECK-PTX-NEXT: .param .b64 param1; -; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd5; +; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5; ; CHECK-PTX-NEXT: .param .b32 retval0; ; CHECK-PTX-NEXT: call.uni (retval0), ; CHECK-PTX-NEXT: variadics2, @@ -258,9 +258,9 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: param0, ; CHECK-PTX-NEXT: param1 ; CHECK-PTX-NEXT: ); -; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0]; ; CHECK-PTX-NEXT: } // callseq 1 -; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-PTX-NEXT: ret; entry: %s1.sroa.3 = alloca [3 x i8], align 1 @@ -286,7 +286,7 @@ define dso_local i32 @variadics3(i32 noundef %first, ...) { ; CHECK-PTX-NEXT: add.s32 %r5, %r1, %r2; ; CHECK-PTX-NEXT: add.s32 %r6, %r5, %r3; ; CHECK-PTX-NEXT: add.s32 %r7, %r6, %r4; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r7; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 @@ -321,13 +321,13 @@ define dso_local i32 @baz() { ; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot5; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; -; CHECK-PTX-NEXT: st.v4.u32 [%SP+0], {%r1, %r1, %r1, %r1}; +; CHECK-PTX-NEXT: st.v4.u32 [%SP], {%r1, %r1, %r1, %r1}; ; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0; ; CHECK-PTX-NEXT: { // callseq 2, 0 ; CHECK-PTX-NEXT: .param .b32 param0; -; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: st.param.b32 [param0], 1; ; CHECK-PTX-NEXT: .param .b64 param1; -; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd1; +; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1; ; CHECK-PTX-NEXT: .param .b32 retval0; ; CHECK-PTX-NEXT: call.uni (retval0), ; CHECK-PTX-NEXT: variadics3, @@ -335,9 +335,9 @@ define dso_local i32 @baz() { ; CHECK-PTX-NEXT: param0, ; CHECK-PTX-NEXT: param1 ; CHECK-PTX-NEXT: ); -; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0]; ; CHECK-PTX-NEXT: } // callseq 2 -; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-PTX-NEXT: ret; entry: %call = call i32 (i32, ...) 
@variadics3(i32 noundef 1, <4 x i32> noundef ) @@ -360,7 +360,7 @@ define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, . ; CHECK-PTX-NEXT: add.s64 %rd7, %rd5, %rd6; ; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd4; ; CHECK-PTX-NEXT: cvt.u32.u64 %r1, %rd8; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 @@ -395,7 +395,7 @@ define dso_local void @qux() { ; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot7; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-PTX-NEXT: ld.global.nc.u64 %rd1, [__const_$_qux_$_s]; -; CHECK-PTX-NEXT: st.u64 [%SP+0], %rd1; +; CHECK-PTX-NEXT: st.u64 [%SP], %rd1; ; CHECK-PTX-NEXT: mov.u64 %rd2, __const_$_qux_$_s; ; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 8; ; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [%rd3]; @@ -405,10 +405,10 @@ define dso_local void @qux() { ; CHECK-PTX-NEXT: add.u64 %rd6, %SP, 16; ; CHECK-PTX-NEXT: { // callseq 3, 0 ; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16]; -; CHECK-PTX-NEXT: st.param.b64 [param0+0], %rd1; +; CHECK-PTX-NEXT: st.param.b64 [param0], %rd1; ; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd4; ; CHECK-PTX-NEXT: .param .b64 param1; -; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd6; +; CHECK-PTX-NEXT: st.param.b64 [param1], %rd6; ; CHECK-PTX-NEXT: .param .b32 retval0; ; CHECK-PTX-NEXT: call.uni (retval0), ; CHECK-PTX-NEXT: variadics4, @@ -416,7 +416,7 @@ define dso_local void @qux() { ; CHECK-PTX-NEXT: param0, ; CHECK-PTX-NEXT: param1 ; CHECK-PTX-NEXT: ); -; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0+0]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0]; ; CHECK-PTX-NEXT: } // callseq 3 ; CHECK-PTX-NEXT: ret; entry: diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll index f4f5c26be347..9a190a0892e5 100644 --- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -9,7 +9,7 @@ define <16 x float> @test_v16f32(<16 x float> %a) { ; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; ; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; ; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; -; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]} ; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} ; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]} ; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]} @@ -21,7 +21,7 @@ define <8 x float> @test_v8f32(<8 x float> %a) { ; CHECK-LABEL: test_v8f32( ; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; ; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; -; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]} ; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} ; CHECK: ret; ret <8 x float> %a @@ -30,7 +30,7 @@ define <8 x float> @test_v8f32(<8 x float> %a) { define <4 x float> @test_v4f32(<4 x float> %a) { ; CHECK-LABEL: test_v4f32( ; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; -; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]} ; CHECK: ret; ret <4 x float> %a } @@ -38,7 +38,7 @@ define <4 x float> @test_v4f32(<4 x float> %a) { define <2 x float> @test_v2f32(<2 x float> %a) 
{ ; CHECK-LABEL: test_v2f32( ; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; -; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v2.f32 [func_retval0], {[[V_0_3]]} ; CHECK: ret; ret <2 x float> %a } @@ -48,7 +48,7 @@ define <3 x float> @test_v3f32(<3 x float> %a) { ; CHECK-LABEL: test_v3f32( ; CHECK-DAG: ld.param.f32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8]; ; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; -; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_1]]} +; CHECK-DAG: st.param.v2.f32 [func_retval0], {[[V_0_1]]} ; CHECK-DAG: st.param.f32 [func_retval0+8], [[V_2]] ; CHECK: ret; ret <3 x float> %a @@ -60,7 +60,7 @@ define <8 x i64> @test_v8i64(<8 x i64> %a) { ; CHECK-DAG: ld.param.v2.u64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; ; CHECK-DAG: ld.param.v2.u64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; ; CHECK-DAG: ld.param.v2.u64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[V_0_1]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_1]]} ; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]} ; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]} ; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]} @@ -72,7 +72,7 @@ define <16 x i16> @test_v16i16(<16 x i16> %a) { ; CHECK-LABEL: test_v16i16( ; CHECK-DAG: ld.param.v4.u32 {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; ; CHECK-DAG: ld.param.v4.u32 {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[V_0_7]]} +; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_7]]} ; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_8_15]]} ; CHECK: ret; ret <16 x i16> %a diff --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll index 162061ff34ba..2a45c8271e9b 100644 --- a/llvm/test/CodeGen/NVPTX/vector-args.ll +++ b/llvm/test/CodeGen/NVPTX/vector-args.ll @@ -29,7 +29,7 @@ define <4 x float> @baz(<4 x float> %a) { ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) baz ; CHECK: .param .align 16 .b8 baz_param_0[16] ; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} -; CHECK: st.param.v4.f32 [func_retval0+0], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: st.param.v4.f32 [func_retval0], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = fmul <4 x float> %a, %a ret <4 x float> %t1 } diff --git a/llvm/test/CodeGen/NVPTX/vector-call.ll b/llvm/test/CodeGen/NVPTX/vector-call.ll index 15e4697333cb..e91d4e20a44a 100644 --- a/llvm/test/CodeGen/NVPTX/vector-call.ll +++ b/llvm/test/CodeGen/NVPTX/vector-call.ll @@ -8,7 +8,7 @@ declare void @bar(<4 x i32>) ; CHECK-LABEL: .func foo( ; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0]; ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK: call.uni ; CHECK: ret; define void @foo(<4 x i32> %a) { @@ -20,7 +20,7 @@ define void @foo(<4 x i32> %a) { ; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0]; ; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8]; ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; 
CHECK-DAG: st.param.b32 [param0+8], [[E2]]; ; CHECK: call.uni ; CHECK: ret; diff --git a/llvm/test/CodeGen/NVPTX/vector-returns.ll b/llvm/test/CodeGen/NVPTX/vector-returns.ll index 956f74392ae1..520736c4cec5 100644 --- a/llvm/test/CodeGen/NVPTX/vector-returns.ll +++ b/llvm/test/CodeGen/NVPTX/vector-returns.ll @@ -10,7 +10,7 @@ define <3 x i64> @long3() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u64 %rd1, 0; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd1, %rd1}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd1}; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd1; ; CHECK-NEXT: ret; ret <3 x i64> zeroinitializer @@ -23,7 +23,7 @@ define <2 x i64> @long2() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u64 %rd1, 0; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd1, %rd1}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd1}; ; CHECK-NEXT: ret; ret <2 x i64> zeroinitializer } @@ -35,7 +35,7 @@ define <1 x i64> @long1() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u64 %rd1, 0; -; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-NEXT: ret; ret <1 x i64> zeroinitializer } @@ -47,7 +47,7 @@ define <5 x i32> @int5() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r1, %r1, %r1, %r1}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r1, %r1, %r1}; ; CHECK-NEXT: st.param.b32 [func_retval0+16], %r1; ; CHECK-NEXT: ret; ret <5 x i32> zeroinitializer @@ -60,7 +60,7 @@ define <4 x i32> @int4() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r1, %r1, %r1, %r1}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r1, %r1, %r1}; ; CHECK-NEXT: ret; ret <4 x i32> zeroinitializer } @@ -72,7 +72,7 @@ define <3 x i32> @int3() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r1, %r1}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r1}; ; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1; ; CHECK-NEXT: ret; ret <3 x i32> zeroinitializer @@ -85,7 +85,7 @@ define <2 x i32> @int2() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r1, %r1}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r1}; ; CHECK-NEXT: ret; ret <2 x i32> zeroinitializer } @@ -97,7 +97,7 @@ define <1 x i32> @int1() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; ret <1 x i32> zeroinitializer } @@ -109,7 +109,7 @@ define <9 x i16> @short9() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b16 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b16 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.b16 [func_retval0+16], %rs1; ; CHECK-NEXT: ret; @@ -123,7 +123,7 @@ define <8 x i16> @short8() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r1, %r1, %r1, %r1}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r1, %r1, %r1}; ; CHECK-NEXT: ret; ret <8 x i16> zeroinitializer } @@ -135,7 +135,7 @@ define <7 x i16> @short7() { ; CHECK-EMPTY: ; CHECK-NEXT: 
// %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b16 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b16 [func_retval0+8], {%rs1, %rs1}; ; CHECK-NEXT: st.param.b16 [func_retval0+12], %rs1; ; CHECK-NEXT: ret; @@ -149,7 +149,7 @@ define <5 x i16> @short5() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b16 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.b16 [func_retval0+8], %rs1; ; CHECK-NEXT: ret; ret <5 x i16> zeroinitializer @@ -162,7 +162,7 @@ define <4 x i16> @short4() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r1, %r1}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r1}; ; CHECK-NEXT: ret; ret <4 x i16> zeroinitializer } @@ -174,7 +174,7 @@ define <3 x i16> @short3() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v2.b16 [func_retval0+0], {%rs1, %rs1}; +; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs1, %rs1}; ; CHECK-NEXT: st.param.b16 [func_retval0+4], %rs1; ; CHECK-NEXT: ret; ret <3 x i16> zeroinitializer @@ -187,7 +187,7 @@ define <2 x i16> @short2() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; ret <2 x i16> zeroinitializer } @@ -199,7 +199,7 @@ define <1 x i16> @short1() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b16 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; ret <1 x i16> zeroinitializer } @@ -211,7 +211,7 @@ define <17 x i8> @byte17() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs1, %rs1, %rs1, %rs1}; @@ -227,7 +227,7 @@ define <16 x i8> @byte16() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r1, %r1, %r1, %r1}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r1, %r1, %r1}; ; CHECK-NEXT: ret; ret <16 x i8> zeroinitializer } @@ -239,7 +239,7 @@ define <15 x i8> @byte15() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+12], {%rs1, %rs1}; @@ -255,7 +255,7 @@ define <9 x i8> @byte9() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1; ; CHECK-NEXT: ret; @@ -269,7 +269,7 @@ 
define <8 x i8> @byte8() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r1, %r1}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r1}; ; CHECK-NEXT: ret; ret <8 x i8> zeroinitializer } @@ -281,7 +281,7 @@ define <7 x i8> @byte7() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+4], {%rs1, %rs1}; ; CHECK-NEXT: st.param.b8 [func_retval0+6], %rs1; ; CHECK-NEXT: ret; @@ -295,7 +295,7 @@ define <5 x i8> @byte5() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1; ; CHECK-NEXT: ret; ret <5 x i8> zeroinitializer @@ -308,7 +308,7 @@ define <4 x i8> @byte4() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; ret <4 x i8> zeroinitializer } @@ -320,7 +320,7 @@ define <3 x i8> @byte3() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; ret <3 x i8> zeroinitializer } @@ -332,7 +332,7 @@ define <2 x i8> @byte2() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; ret <2 x i8> zeroinitializer } @@ -344,7 +344,7 @@ define <1 x i8> @byte1() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: ret; ret <1 x i8> zeroinitializer } @@ -356,7 +356,7 @@ define <17 x i1> @bit17() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs1, %rs1, %rs1, %rs1}; +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs1, %rs1, %rs1, %rs1}; ; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs1, %rs1, %rs1, %rs1}; @@ -372,7 +372,7 @@ define <16 x i1> @bit16() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {%rs1, %rs1}; +; CHECK-NEXT: st.param.v2.b8 [func_retval0], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+2], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+4], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+6], {%rs1, %rs1}; @@ -391,7 +391,7 @@ define <15 x i1> @bit15() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {%rs1, %rs1}; +; CHECK-NEXT: st.param.v2.b8 [func_retval0], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+2], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+4], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+6], {%rs1, %rs1}; @@ -410,7 +410,7 @@ define <9 x i1> @bit9() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: 
st.param.v2.b8 [func_retval0+0], {%rs1, %rs1}; +; CHECK-NEXT: st.param.v2.b8 [func_retval0], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+2], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+4], {%rs1, %rs1}; ; CHECK-NEXT: st.param.v2.b8 [func_retval0+6], {%rs1, %rs1}; @@ -426,7 +426,7 @@ define <8 x i1> @bit8() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs1; @@ -445,7 +445,7 @@ define <7 x i1> @bit7() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs1; @@ -463,7 +463,7 @@ define <5 x i1> @bit5() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs1; @@ -479,7 +479,7 @@ define <4 x i1> @bit4() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs1; @@ -494,7 +494,7 @@ define <3 x i1> @bit3() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs1; ; CHECK-NEXT: ret; @@ -508,7 +508,7 @@ define <2 x i1> @bit2() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs1; ; CHECK-NEXT: ret; ret <2 x i1> zeroinitializer @@ -521,7 +521,7 @@ define <1 x i1> @bit1() { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.u16 %rs1, 0; -; CHECK-NEXT: st.param.b8 [func_retval0+0], %rs1; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: ret; ret <1 x i1> zeroinitializer } diff --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll index 116ab7e3978c..31517939a4b7 100644 --- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll +++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll @@ -10,7 +10,7 @@ ; CHECK: .loc 1 5 3 // t.c:5:3 ; CHECK: { // callseq 0, 0 ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], %rd1; +; CHECK: st.param.b64 [param0], %rd1; ; CHECK: call.uni ; CHECK: escape_foo, ; CHECK: ( diff --git a/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll b/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll index c9c1406a0fa8..face96f85975 100644 --- a/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll +++ b/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll @@ -22,7 +22,7 @@ define void @nary_reassociate_after_slsr(i32 %a, i32 %b, i32 %c) { %abc = add i32 
%ab, %c
   call void @foo(i32 %abc)
 ; CHECK: call void @foo(i32 %abc)
-; PTX: st.param.b32 [param0+0], [[abc:%r[0-9]+]];
+; PTX: st.param.b32 [param0], [[abc:%r[0-9]+]];
 
   %b2 = shl i32 %b, 1
   %ab2 = add i32 %a, %b2
@@ -31,7 +31,7 @@ define void @nary_reassociate_after_slsr(i32 %a, i32 %b, i32 %c) {
 ; PTX: add.s32 [[ab2c:%r[0-9]+]], [[abc]], [[b]]
   call void @foo(i32 %ab2c)
 ; CHECK-NEXT: call void @foo(i32 %ab2c)
-; PTX: st.param.b32 [param0+0], [[ab2c]];
+; PTX: st.param.b32 [param0], [[ab2c]];
 
   %b3 = mul i32 %b, 3
   %ab3 = add i32 %a, %b3
@@ -40,7 +40,7 @@ define void @nary_reassociate_after_slsr(i32 %a, i32 %b, i32 %c) {
 ; PTX: add.s32 [[ab3c:%r[0-9]+]], [[ab2c]], [[b]]
   call void @foo(i32 %ab3c)
 ; CHECK-NEXT: call void @foo(i32 %ab3c)
-; PTX: st.param.b32 [param0+0], [[ab3c]];
+; PTX: st.param.b32 [param0], [[ab3c]];
 
   ret void
 }
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
index 5c9af3bb44da..a64364019de1 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
@@ -23,10 +23,10 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
 ; CHECK-NEXT:    ld.param.u64 %rd3, [caller_St8x4_param_0+8];
 ; CHECK-NEXT:    st.u64 [%SP+8], %rd3;
 ; CHECK-NEXT:    ld.param.u64 %rd4, [caller_St8x4_param_0];
-; CHECK-NEXT:    st.u64 [%SP+0], %rd4;
+; CHECK-NEXT:    st.u64 [%SP], %rd4;
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[32];
-; CHECK-NEXT:    st.param.v2.b64 [param0+0], {%rd4, %rd3};
+; CHECK-NEXT:    st.param.v2.b64 [param0], {%rd4, %rd3};
 ; CHECK-NEXT:    st.param.v2.b64 [param0+16], {%rd2, %rd1};
 ; CHECK-NEXT:    .param .align 16 .b8 retval0[32];
 ; CHECK-NEXT:    call.uni (retval0),
@@ -34,7 +34,7 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [retval0+0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [retval0];
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd7, %rd8}, [retval0+16];
 ; CHECK-NEXT:    } // callseq 0
 ; CHECK-NEXT:    st.u64 [%r1], %rd5;
@@ -66,7 +66,7 @@ define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly by
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [callee_St8x4_param_0];
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [callee_St8x4_param_0+16];
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd1, %rd2};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd1, %rd2};
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
 ; CHECK-NEXT:    ret;
   %1 = load i64, ptr %in, align 8
-- 
GitLab

From 02bf3b54c02643069ad1a952c19f97cab00a3241 Mon Sep 17 00:00:00 2001
From: Felix Schneider
Date: Sat, 19 Oct 2024 18:25:27 +0200
Subject: [PATCH 158/511] [mlir][linalg] Add quantized conv2d operator with
 FCHW,NCHW order (#107740)

This patch adds a quantized version of the `linalg.conv2d_nchw_fchw` Op.
This is the "channel-first" ordering typically used by PyTorch and others.
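As a reading aid for the YAML and Python definitions below, here is a small
C++ scalar model of what the new op computes. It is only an illustrative
sketch (unit stride and dilation, hypothetical flattened buffers and names;
it is not code from this patch). Each partial product subtracts the zero
points first, i.e. (input - IZp) * (kernel - KZp), using NCHW input and FCHW
kernel indexing:

#include <cstdint>
#include <vector>

// Scalar model of linalg.conv_2d_nchw_fchw_q with stride = dilation = 1.
// I is NCHW (N x C x H x W), K is FCHW (F x C x KH x KW), and O is
// N x F x OH x OW with OH = H - KH + 1, OW = W - KW + 1; all row-major.
void conv2dNchwFchwQ(const std::vector<int8_t> &I, const std::vector<int8_t> &K,
                     int32_t IZp, int32_t KZp, std::vector<int32_t> &O, int N,
                     int C, int H, int W, int F, int KH, int KW) {
  const int OH = H - KH + 1, OW = W - KW + 1;
  for (int n = 0; n < N; ++n)
    for (int f = 0; f < F; ++f)
      for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow) {
          int32_t acc = 0;
          for (int c = 0; c < C; ++c)
            for (int kh = 0; kh < KH; ++kh)
              for (int kw = 0; kw < KW; ++kw) {
                // Promote to the i32 accumulator type before subtracting the
                // zero points, mirroring the cast_signed ops in the YAML.
                int32_t i = I[((n * C + c) * H + (oh + kh)) * W + (ow + kw)];
                int32_t k = K[((f * C + c) * KH + kh) * KW + kw];
                acc += (i - IZp) * (k - KZp);
              }
          O[((n * F + f) * OH + oh) * OW + ow] = acc;
        }
}

The promotion order is the point of the `conv2d_channel_first_q_promote`
test added to roundtrip.mlir below: i8 operands are widened to the i32
accumulator type before the zero points are subtracted.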
--- .../Linalg/IR/LinalgNamedStructuredOps.yaml | 137 ++++++++++++++++++ .../linalg/opdsl/ops/core_named_ops.py | 29 ++++ mlir/test/Dialect/Linalg/roundtrip.mlir | 30 ++++ 3 files changed, 196 insertions(+) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index 8cb698096ef5..bf2f26de26e9 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -3114,6 +3114,143 @@ structured_op: !LinalgStructuredOpConfig - !ScalarExpression scalar_arg: KZp --- !LinalgOpConfig +metadata: !LinalgOpMetadata + name: conv_2d_nchw_fchw_q + cpp_class_name: Conv2DNchwFchwQOp + doc: |- + Performs 2-D convolution with zero point offsets. + + Layout: + * Input: NCHW. + * Kernel: FCHW. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. This includes the zero + point offsets common to quantized operations. + implements: + - LinalgConvolutionOpInterface +structured_op: !LinalgStructuredOpConfig + args: + - !LinalgOperandDefConfig + name: I + kind: input_tensor + type_var: T1 + shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10] -> (s0, + s1, s2 * s3 + s4 * s5, s6 * s7 + s8 * s9)> + - !LinalgOperandDefConfig + name: K + kind: input_tensor + type_var: T2 + shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10] -> (s10, + s1, s4, s8)> + - !LinalgOperandDefConfig + name: IZp + kind: scalar + type_var: I32 + - !LinalgOperandDefConfig + name: KZp + kind: scalar + type_var: I32 + - !LinalgOperandDefConfig + name: O + kind: output_tensor + type_var: U + shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10] -> (s0, + s10, s2, s6)> + - !LinalgOperandDefConfig + name: strides + kind: index_attr + index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10] -> + (s3, s7)> + default_indices: + - 1 + - 1 + - !LinalgOperandDefConfig + name: dilations + kind: index_attr + index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10] -> + (s5, s9)> + default_indices: + - 1 + - 1 + indexing_maps: !LinalgIndexingMapsConfig + static_indexing_maps: + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10] -> (d0, d4, d2 * s3 + d5 * s5, d3 * s7 + d6 * s9)> + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10] -> (d1, d4, d5, d6)> + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10] -> ()> + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10] -> ()> + - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, + s9, s10] -> (d0, d1, d2, d3)> + iterator_types: + - parallel + - parallel + - parallel + - parallel + - reduction + - reduction + - reduction + assignments: + - !ScalarAssign + arg: O + value: !ScalarExpression + scalar_fn: + kind: binary + fn_name: add + operands: + - !ScalarExpression + scalar_arg: O + - !ScalarExpression + scalar_fn: + kind: binary + fn_name: mul + operands: + - !ScalarExpression + scalar_fn: + kind: binary + fn_name: sub + operands: + - !ScalarExpression + scalar_fn: + kind: type + fn_name: cast_signed + type_var: U + operands: + - !ScalarExpression + scalar_arg: I + - !ScalarExpression + scalar_fn: + kind: type + fn_name: cast_signed + type_var: U + operands: + - !ScalarExpression + scalar_arg: IZp + - 
!ScalarExpression + scalar_fn: + kind: binary + fn_name: sub + operands: + - !ScalarExpression + scalar_fn: + kind: type + fn_name: cast_signed + type_var: U + operands: + - !ScalarExpression + scalar_arg: K + - !ScalarExpression + scalar_fn: + kind: type + fn_name: cast_signed + type_var: U + operands: + - !ScalarExpression + scalar_arg: KZp +--- !LinalgOpConfig metadata: !LinalgOpMetadata name: conv_2d_nchw_fchw cpp_class_name: Conv2DNchwFchwOp diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index e4a6ec7487bb..b45fecd0ee14 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -876,6 +876,35 @@ def conv_2d_nhwc_fhwc_q( ) * (TypeFn.cast_signed(U, K[D.f, D.kh, D.kw, D.c]) - TypeFn.cast_signed(U, KZp)) +@linalg_structured_op +def conv_2d_nchw_fchw_q( + I=TensorDef(T1, S.N, S.C, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW), + K=TensorDef(T2, S.F, S.C, S.KH, S.KW), + IZp=ScalarDef(I32), + KZp=ScalarDef(I32), + O=TensorDef(U, S.N, S.F, S.OH, S.OW, output=True), + strides=IndexAttrDef(S.SH, S.SW, default=[1, 1]), + dilations=IndexAttrDef(S.DH, S.DW, default=[1, 1]), +): + """Performs 2-D convolution with zero point offsets. + + Layout: + * Input: NCHW. + * Kernel: FCHW. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. This includes the zero + point offsets common to quantized operations. + """ + implements(ConvolutionOpInterface) + domain(D.n, D.f, D.oh, D.ow, D.c, D.kh, D.kw) + O[D.n, D.f, D.oh, D.ow] += ( + TypeFn.cast_signed( + U, I[D.n, D.c, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW] + ) + - TypeFn.cast_signed(U, IZp) + ) * (TypeFn.cast_signed(U, K[D.f, D.c, D.kh, D.kw]) - TypeFn.cast_signed(U, KZp)) + @linalg_structured_op def conv_2d_nchw_fchw( I=TensorDef(T1, S.N, S.C, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW), diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir index 146e9780b8eb..1b8969bd1155 100644 --- a/mlir/test/Dialect/Linalg/roundtrip.mlir +++ b/mlir/test/Dialect/Linalg/roundtrip.mlir @@ -664,3 +664,33 @@ func.func @winograd_output_dyn(%arg0: tensor<6x6x?x?x?x?xf32>, %arg1: tensor) outs(%arg1 : tensor) -> tensor + +// ----- + +func.func @conv2d_channel_first_q(%img: tensor<100x3x224x224xi32>, %filt: tensor<64x3x5x5xi32>, %a: i32, %b: i32) -> tensor<100x64x220x220xi32> { + %init = arith.constant dense<0> : tensor<100x64x220x220xi32> + %1 = linalg.conv_2d_nchw_fchw_q {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins(%img, %filt, %a, %b : tensor<100x3x224x224xi32>, tensor<64x3x5x5xi32>, i32, i32) + outs(%init : tensor<100x64x220x220xi32>) -> tensor<100x64x220x220xi32> + return %1 : tensor<100x64x220x220xi32> +} + +// CHECK-LABEL: func @conv2d_channel_first_q( +// CHECK: %[[arg0:[a-zA-z0-9]*]]: tensor<100x3x224x224xi32>, %[[arg1:[a-zA-z0-9]*]]: tensor<64x3x5x5xi32>, %[[arg2:[a-zA-z0-9]*]]: i32, %[[arg3:[a-zA-z0-9]*]]: i32) +// CHECK: linalg.conv_2d_nchw_fchw_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]] : tensor<100x3x224x224xi32>, tensor<64x3x5x5xi32>, i32, i32) outs(%{{.*}} : tensor<100x64x220x220xi32>) -> tensor<100x64x220x220xi32> + +// ----- + +func.func @conv2d_channel_first_q_promote(%img: tensor<100x3x224x224xi8>, %filt: 
tensor<64x3x5x5xi8>, %a: i8, %b: i8) -> tensor<100x64x220x220xi32> {
+  %init = arith.constant dense<0> : tensor<100x64x220x220xi32>
+  %1 = linalg.conv_2d_nchw_fchw_q {dilations = dense<1> : tensor<2xi64>,
+                                   strides = dense<1> : tensor<2xi64>}
+    ins(%img, %filt, %a, %b : tensor<100x3x224x224xi8>, tensor<64x3x5x5xi8>, i8, i8)
+    outs(%init : tensor<100x64x220x220xi32>) -> tensor<100x64x220x220xi32>
+  return %1 : tensor<100x64x220x220xi32>
+}
+
+// CHECK-LABEL: func @conv2d_channel_first_q_promote(
+// CHECK: %[[arg0:[a-zA-z0-9]*]]: tensor<100x3x224x224xi8>, %[[arg1:[a-zA-z0-9]*]]: tensor<64x3x5x5xi8>, %[[arg2:[a-zA-z0-9]*]]: i8, %[[arg3:[a-zA-z0-9]*]]: i8)
+// CHECK: linalg.conv_2d_nchw_fchw_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]] : tensor<100x3x224x224xi8>, tensor<64x3x5x5xi8>, i8, i8) outs(%{{.*}} : tensor<100x64x220x220xi32>) -> tensor<100x64x220x220xi32>
-- 
GitLab

From 697a455e6fecf364c1ac4ff9874aefddf2952454 Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Sat, 19 Oct 2024 09:38:25 -0700
Subject: [PATCH 159/511] More aggressively deduplicate global warnings based
 on contents. (#112801)

I've been getting complaints from users who are spammed by runaway
-gmodules missing-file warnings: each object file depends on an entire
DAG of PCM files, and those are usually all missing at once.

To reduce this problem, this patch does two things:

1. Module now maintains a DenseMap that is used to display each warning
   only once, based on its actual text.

2. The PCM warning itself is reworded to drop details, such as the DIE
   offset, that are only useful to LLDB developers, who can get them from
   the dwarf log if they need them. Because the detail is omitted, the
   hashing from (1) deduplicates the warnings.

rdar://138144624
---
 lldb/include/lldb/Core/Module.h               |  9 ++++--
 lldb/source/Core/Module.cpp                   | 29 ++++++++++++-------
 .../SymbolFile/DWARF/SymbolFileDWARF.cpp      | 25 ++++++++--------
 .../SymbolFile/DWARF/TestDedupWarnings.test   | 22 ++++++++++++++
 4 files changed, 61 insertions(+), 24 deletions(-)
 create mode 100644 lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test

diff --git a/lldb/include/lldb/Core/Module.h b/lldb/include/lldb/Core/Module.h
index 5589c1c9a350..23257e429ad0 100644
--- a/lldb/include/lldb/Core/Module.h
+++ b/lldb/include/lldb/Core/Module.h
@@ -30,6 +30,7 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/StableHashing.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Chrono.h"
@@ -1057,8 +1058,11 @@ protected:
   /// time for the symbol tables can be aggregated here.
   StatsDuration m_symtab_index_time;
 
-  std::once_flag m_optimization_warning;
-  std::once_flag m_language_warning;
+  /// A set of hashes of all warnings and errors, to avoid reporting them
+  /// multiple times to the same Debugger.
+  llvm::DenseMap<llvm::stable_hash, std::unique_ptr<std::once_flag>>
+      m_shown_diagnostics;
+  std::recursive_mutex m_diagnostic_mutex;
 
   void SymbolIndicesToSymbolContextList(Symtab *symtab,
                                         std::vector<uint32_t> &symbol_indexes,
@@ -1086,6 +1090,7 @@ private:
   void ReportWarning(const llvm::formatv_object_base &payload);
   void ReportError(const llvm::formatv_object_base &payload);
   void ReportErrorIfModifyDetected(const llvm::formatv_object_base &payload);
+  std::once_flag *GetDiagnosticOnceFlag(llvm::StringRef msg);
 };
 
 } // namespace lldb_private
diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp
index 88cc957e91fa..03eb81459b29 100644
--- a/lldb/source/Core/Module.cpp
+++ b/lldb/source/Core/Module.cpp
@@ -1093,8 +1093,8 @@ void Module::ReportWarningOptimization(
   ss << file_name
      << " was compiled with optimization - stepping may behave "
        "oddly; variables may not be available.";
-  Debugger::ReportWarning(std::string(ss.GetString()), debugger_id,
-                          &m_optimization_warning);
+  llvm::StringRef msg = ss.GetString();
+  Debugger::ReportWarning(msg.str(), debugger_id, GetDiagnosticOnceFlag(msg));
 }
 
 void Module::ReportWarningUnsupportedLanguage(
@@ -1104,8 +1104,8 @@
      << Language::GetNameForLanguageType(language)
      << "\". "
        "Inspection of frame variables will be limited.";
-  Debugger::ReportWarning(std::string(ss.GetString()), debugger_id,
-                          &m_language_warning);
+  llvm::StringRef msg = ss.GetString();
+  Debugger::ReportWarning(msg.str(), debugger_id, GetDiagnosticOnceFlag(msg));
 }
 
 void Module::ReportErrorIfModifyDetected(
@@ -1125,20 +1125,29 @@
   }
 }
 
+std::once_flag *Module::GetDiagnosticOnceFlag(llvm::StringRef msg) {
+  std::lock_guard<std::recursive_mutex> guard(m_diagnostic_mutex);
+  auto &once_ptr = m_shown_diagnostics[llvm::stable_hash_name(msg)];
+  if (!once_ptr)
+    once_ptr = std::make_unique<std::once_flag>();
+  return once_ptr.get();
+}
+
 void Module::ReportError(const llvm::formatv_object_base &payload) {
   StreamString strm;
   GetDescription(strm.AsRawOstream(), lldb::eDescriptionLevelBrief);
-  strm.PutChar(' ');
-  strm.PutCString(payload.str());
-  Debugger::ReportError(strm.GetString().str());
+  std::string msg = payload.str();
+  strm << ' ' << msg;
+  Debugger::ReportError(strm.GetString().str(), {}, GetDiagnosticOnceFlag(msg));
 }
 
 void Module::ReportWarning(const llvm::formatv_object_base &payload) {
   StreamString strm;
   GetDescription(strm.AsRawOstream(), lldb::eDescriptionLevelFull);
-  strm.PutChar(' ');
-  strm.PutCString(payload.str());
-  Debugger::ReportWarning(std::string(strm.GetString()));
+  std::string msg = payload.str();
+  strm << ' ' << msg;
+  Debugger::ReportWarning(strm.GetString().str(), {},
+                          GetDiagnosticOnceFlag(msg));
 }
 
 void Module::LogMessage(Log *log, const llvm::formatv_object_base &payload) {
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 9287d4baf19e..e5b8eee8d08c 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -2069,13 +2069,15 @@ void SymbolFileDWARF::UpdateExternalModuleListIfNeeded() {
     Status error = ModuleList::GetSharedModule(dwo_module_spec, module_sp,
                                                nullptr, nullptr, nullptr);
     if (!module_sp) {
+      // ReportWarning also rate-limits based on the warning string,
+      // but in a -gmodules build, each object file has a similar DAG
+      // of module dependencies that would all be listed here.
GetObjectFile()->GetModule()->ReportWarning( - "{0:x16}: unable to locate module needed for external types: " - "{1}\nerror: {2}\nDebugging will be degraded due to missing " - "types. Rebuilding the project will regenerate the needed " - "module files.", - die.GetOffset(), dwo_module_spec.GetFileSpec().GetPath().c_str(), - error.AsCString("unknown error")); + "{0}", error.AsCString("unknown error")); + GetObjectFile()->GetModule()->ReportWarning( + "Unable to locate module needed for external types.\n" + "Debugging will be degraded due to missing types. Rebuilding the " + "project will regenerate the needed module files."); continue; } @@ -2095,12 +2097,11 @@ void SymbolFileDWARF::UpdateExternalModuleListIfNeeded() { if (dwo_id != dwo_dwo_id) { GetObjectFile()->GetModule()->ReportWarning( - "{0:x16}: Module {1} is out-of-date (hash mismatch). Type " - "information " - "from this module may be incomplete or inconsistent with the rest of " - "the program. Rebuilding the project will regenerate the needed " - "module files.", - die.GetOffset(), dwo_module_spec.GetFileSpec().GetPath().c_str()); + "Module {0} is out-of-date (hash mismatch).\n" + "Type information from this module may be incomplete or inconsistent " + "with the rest of the program. Rebuilding the project will " + "regenerate the needed module files.", + dwo_module_spec.GetFileSpec().GetPath()); } } } diff --git a/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test b/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test new file mode 100644 index 000000000000..d4fcf78d01b8 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test @@ -0,0 +1,22 @@ +# REQUIRES: system-darwin +# Test the rate-limiting of module not found warnings. +# RUN: rm -rf %t +# RUN: mkdir -p %t + +# RUN: echo 'module "C" { header "c.h" }' >%t/module.modulemap +# RUN: echo 'struct c {};' >>%t/c.h +# RUN: echo '@import C;' >%t/a.m +# RUN: echo 'struct a { struct c c; } a;' >>%t/a.m +# RUN: echo '@import C;' >%t/b.m +# RUN: echo 'struct b { struct c c; } b;' >>%t/b.m +# RUN: echo 'int main() {}' >>%t/b.m + +# RUN: %clang_host -fmodules -Xclang -fmodules-cache-path=%t/cache -I%t -g -gmodules %t/a.m -o %t/a.o -c +# RUN: %clang_host -fmodules -Xclang -fmodules-cache-path=%t/cache -I%t -g -gmodules %t/b.m -o %t/b.o -c +# RUN: %clang_host %t/a.o %t/b.o -o %t/a.out +# RUN: rm -rf %t/cache +# RUN: %lldb %t/a.out -o "b main" -o run -o "p a" -o "p b" -o q 2>&1 | FileCheck %s +# CHECK: {{[ab]}}.o{{.*}}/cache/{{.*}}/C-{{.*}}.pcm' does not exist +# CHECK-NOT: /cache/{{.*}}/C-{.*}.pcm' does not exist +# CHECK: {{[ab]}}.o{{.*}}/cache/{{.*}}/C-{{.*}}.pcm' does not exist +# CHECK-NOT: /cache/{{.*}}/C-{.*}.pcm' does not exist -- GitLab From f87f3ad6ea8bb80cba9ce009079e1b6c7486feac Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sat, 19 Oct 2024 10:42:33 -0700 Subject: [PATCH 160/511] [Github] Bump CI compiler version to 19.1.2 (#113016) --- .github/workflows/containers/github-action-ci/stage1.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/containers/github-action-ci/stage1.Dockerfile b/.github/workflows/containers/github-action-ci/stage1.Dockerfile index 73828cc05736..3e2c1ab11d58 100644 --- a/.github/workflows/containers/github-action-ci/stage1.Dockerfile +++ b/.github/workflows/containers/github-action-ci/stage1.Dockerfile @@ -2,7 +2,7 @@ FROM docker.io/library/ubuntu:22.04 as base ENV LLVM_SYSROOT=/opt/llvm FROM base as stage1-toolchain -ENV LLVM_VERSION=18.1.8 +ENV LLVM_VERSION=19.1.2 RUN apt-get 
update && \ apt-get install -y \ -- GitLab From ef91cd3f018411e0ba7989003d7617041e35f650 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 19 Oct 2024 12:33:03 -0700 Subject: [PATCH 161/511] AMDGPU: Handle folding frame indexes into add with immediate (#110738) --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 17 ++++ llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 6 +- .../CodeGen/AMDGPU/fold-fi-operand-shrink.mir | 3 +- .../fold-operands-frame-index.gfx10.mir | 15 +-- .../AMDGPU/fold-operands-frame-index.mir | 94 +++++-------------- .../CodeGen/AMDGPU/frame-index-elimination.ll | 12 +-- .../materialize-frame-index-sgpr.gfx10.ll | 59 +++++------- .../AMDGPU/materialize-frame-index-sgpr.ll | 52 +++++----- 8 files changed, 99 insertions(+), 159 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 1e2c77b08b9a..c912a580854c 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -194,6 +194,23 @@ bool SIFoldOperandsImpl::frameIndexMayFold( return false; const unsigned Opc = UseMI.getOpcode(); + switch (Opc) { + case AMDGPU::S_ADD_I32: + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_CO_U32_e32: + // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have + // to insert the wave size shift at every point we use the index. + // TODO: Fix depending on visit order to fold immediates into the operand + return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() && + MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg()); + case AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_CO_U32_e64: + return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() && + MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg()); + default: + break; + } + if (TII->isMUBUF(UseMI)) return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); if (!TII->isFLATScratch(UseMI)) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 3e4b43d9cfcd..c5d4ef23070e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4705,8 +4705,7 @@ define amdgpu_ps void @large_offset() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_movk_i32 s0, 0x810 -; GFX10-NEXT: s_addk_i32 s0, 0x3c0 +; GFX10-NEXT: s_movk_i32 s0, 0xbd0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4823,8 +4822,7 @@ define amdgpu_ps void @large_offset() { ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810 -; GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0 +; GFX10-PAL-NEXT: s_movk_i32 s0, 0xbd0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir index 2b5ec86244ec..8626ac0f23ec 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir @@ -183,8 +183,7 @@ body: | bb.0: ; GCN-LABEL: name: shrink_vgpr_imm_vgpr_fi_v_add_i32_e64_no_carry_out_use - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 16, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] %0:vgpr_32 = V_MOV_B32_e32 16, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir index 0d6511cbfceb..d10dec6ca828 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir @@ -13,8 +13,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec + ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_U32_e32_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -34,8 +33,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_const - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 128, 0, implicit $exec + ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 %stack.0, 128, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -57,8 +55,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64__const_v_fi - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 128, [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 128, %stack.0, 0, implicit $exec ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -78,8 +75,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_const - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], 128, 0, implicit $exec + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 128, 0, implicit $exec ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -99,8 +95,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64___fi_const_v - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 128, [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 128, %stack.0, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, 
implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir index aa91a4f9f988..280126a0d7cd 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir @@ -14,8 +14,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_const - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MOV_B32_]], 128, implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 %stack.0 @@ -35,8 +34,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__const_fi - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, [[S_MOV_B32_]], implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, %stack.0, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 %stack.0 @@ -56,8 +54,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__materializedconst_fi - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 256 @@ -101,8 +98,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_1 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 256 @@ -173,8 +169,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec + ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -215,21 +210,10 @@ stack: - { id: 0, size: 16384, alignment: 4, local-offset: 0 } body: | bb.0: - ; GFX9-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX9-NEXT: SI_RETURN implicit $sgpr4 - ; - ; GFX10-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit $exec - ; GFX10-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: SI_RETURN implicit $sgpr4 - ; - ; GFX12-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi - ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit 
$exec - ; GFX12-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX12-NEXT: SI_RETURN implicit $sgpr4 + ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit $exec + ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_ADD_U32_e64 64, %0, 0, implicit $exec $sgpr4 = COPY %1 @@ -246,21 +230,10 @@ stack: - { id: 0, size: 16384, alignment: 4, local-offset: 0 } body: | bb.0: - ; GFX9-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], 64, 0, implicit $exec - ; GFX9-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX9-NEXT: SI_RETURN implicit $sgpr4 - ; - ; GFX10-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec - ; GFX10-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: SI_RETURN implicit $sgpr4 - ; - ; GFX12-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm - ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec - ; GFX12-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX12-NEXT: SI_RETURN implicit $sgpr4 + ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec + ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_ADD_U32_e64 %0, 64, 0, implicit $exec $sgpr4 = COPY %1 @@ -278,8 +251,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e32__const_v_fi - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec + ; CHECK: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, %stack.0, implicit-def $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e32_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -298,21 +270,10 @@ stack: - { id: 0, size: 16384, alignment: 4, local-offset: 0 } body: | bb.0: - ; GFX9-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 64, 0, implicit $exec - ; GFX9-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX10-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm - ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec - ; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX10-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX12-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm - ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec - ; GFX12-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX12-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: 
fold_frame_index__v_add_co_u32_e64__v_fi_imm + ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 %0, 64, 0, implicit $exec $vgpr0 = COPY %1 @@ -329,21 +290,10 @@ stack: - { id: 0, size: 16384, alignment: 4, local-offset: 0 } body: | bb.0: - ; GFX9-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX10-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi - ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec - ; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX10-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX12-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi - ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec - ; GFX12-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX12-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi + ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 64, %0, 0, implicit $exec $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index 4215ae43345f..e3cd8028422d 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -64,8 +64,8 @@ define void @func_mov_fi_i32_offset() #0 { ; GFX9-MUBUF: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] -; GFX9-FLATSCR: v_mov_b32_e32 [[ADD:v[0-9]+]], s32 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]] +; FIXME: Should commute and shrink +; GFX9-FLATSCR: v_add_u32_e64 v0, 4, s32 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -164,12 +164,12 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8 ; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}} ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 -; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] +; CI: v_add_i32_e64 [[GEP:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]] -; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 -; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] -; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] +; GFX9-FLATSCR: v_add_u32_e64 [[GEP:v[0-9]+]], 4, s32 ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll 
b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index e86ef52e413b..302b140e32f3 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -1426,17 +1426,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_1-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v3, 5, s32 ; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 +; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART -; GFX10_1-NEXT: ; use alloca0 v1 +; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0 -; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0 +; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND @@ -1456,17 +1455,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 ; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v3, 5, s32 ; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 +; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART -; GFX10_3-NEXT: ; use alloca0 v1 +; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0 -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0 +; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND @@ -1485,19 +1483,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 ; GFX11-NEXT: scratch_store_b32 off, v2, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x4040 -; GFX11-NEXT: v_writelane_b32 v2, s59, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: v_writelane_b32 v2, s59, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s32 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0 ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use alloca0 v1 +; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s59, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 +; GFX11-NEXT: v_readfirstlane_b32 s59, v1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND @@ 
-1520,17 +1515,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 +; GFX12-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v3, s32 ; GFX12-NEXT: v_writelane_b32 v2, s59, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART -; GFX12-NEXT: ; use alloca0 v1 +; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0 -; GFX12-NEXT: v_readfirstlane_b32 s59, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x43ec, v3 +; GFX12-NEXT: v_readfirstlane_b32 s59, v1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND @@ -1550,10 +1543,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3ec, v0 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x442c, v1 ; GFX8-NEXT: v_writelane_b32 v2, s59, 0 ; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 ; GFX8-NEXT: v_readfirstlane_b32 s59, v0 diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index e9cd94620a6b..308411fa225d 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -1582,12 +1582,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 +; GFX7-NEXT: v_lshr_b32_e64 v1, s32, 6 ; GFX7-NEXT: v_writelane_b32 v22, vcc_lo, 0 ; GFX7-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; GFX7-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, vcc_lo, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x200, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x4240, v1 ; GFX7-NEXT: v_writelane_b32 v23, s59, 27 ; GFX7-NEXT: v_readfirstlane_b32 s59, v0 ; GFX7-NEXT: s_and_b64 vcc, 0, exec @@ -1723,12 +1721,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 ; GFX8-NEXT: v_writelane_b32 v22, vcc_lo, 0 ; GFX8-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x200, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x4240, v1 ; GFX8-NEXT: v_writelane_b32 v23, s59, 27 ; GFX8-NEXT: v_readfirstlane_b32 s59, 
v0 ; GFX8-NEXT: s_and_b64 vcc, 0, exec @@ -1983,17 +1979,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1 -; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART -; GFX10_1-NEXT: ; use alloca0 v1 +; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND ; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2 -; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x200, v0 ; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3 ; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4 ; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5 @@ -2070,17 +2065,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1 -; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART -; GFX10_3-NEXT: ; use alloca0 v1 +; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND ; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2 -; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x200, v0 ; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3 ; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4 ; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5 @@ -2156,17 +2150,15 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v23, s30, 0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x4040 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_writelane_b32 v23, s31, 1 -; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x200, v0 +; GFX11-NEXT: v_writelane_b32 v23, s31, 1 ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use alloca0 v1 +; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 ; GFX11-NEXT: v_writelane_b32 v23, s33, 2 ; GFX11-NEXT: v_writelane_b32 v23, s34, 3 ; GFX11-NEXT: v_writelane_b32 v23, s35, 4 @@ -2248,16 +2240,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v23, s30, 0 -; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s32 ; GFX12-NEXT: s_and_b32 s0, 
0, exec_lo
-; GFX12-NEXT: v_writelane_b32 v23, s31, 1
 ; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use alloca0 v1
+; GFX12-NEXT: ; use alloca0 v0
 ; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_writelane_b32 v23, s31, 1
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x4200, v1
 ; GFX12-NEXT: v_writelane_b32 v23, s33, 2
 ; GFX12-NEXT: v_writelane_b32 v23, s34, 3
 ; GFX12-NEXT: v_writelane_b32 v23, s35, 4
-- 
GitLab


From 06fce61e03d87fcd6b3c2dfb187cdeeaa0d1e20e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 12 Oct 2024 12:13:40 +0100
Subject: [PATCH 162/511] [X86] X86.td - whitespace cleanup. NFC.

---
 llvm/lib/Target/X86/X86.td | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 6cf37836f921..d57450d91ea2 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1528,7 +1528,6 @@ def ProcessorFeatures {
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
 
-
   list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD];
   list<SubtargetFeature> ZN4Tuning =
     !listconcat(ZN3Tuning, ZN4AdditionalTuning);
@@ -1550,7 +1549,6 @@ def ProcessorFeatures {
   list<SubtargetFeature> ZN4Features =
     !listconcat(ZN3Features, ZN4AdditionalFeatures);
 
-
   list<SubtargetFeature> ZN5Tuning = ZN4Tuning;
   list<SubtargetFeature> ZN5AdditionalFeatures = [FeatureVNNI,
                                                   FeatureMOVDIRI,
@@ -1561,7 +1559,6 @@ def ProcessorFeatures {
   ];
   list<SubtargetFeature> ZN5Features =
     !listconcat(ZN4Features, ZN5AdditionalFeatures);
-
 }
 
 //===----------------------------------------------------------------------===//
@@ -1910,7 +1907,7 @@ def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features,
 def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
                 ProcessorFeatures.ZN3Tuning>;
 def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features,
-               ProcessorFeatures.ZN4Tuning>;
+                ProcessorFeatures.ZN4Tuning>;
 def : ProcModel<"znver5", Znver4Model, ProcessorFeatures.ZN5Features,
                 ProcessorFeatures.ZN5Tuning>;
 
-- 
GitLab


From 93ec08d62971d51a239fba8468d3cf9cb9e54fb0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 19 Oct 2024 18:38:45 +0100
Subject: [PATCH 163/511] [DAG] Move SIGN_EXTEND_INREG constant folding inside
 FoldConstantArithmetic

Update visitSIGN_EXTEND_INREG to call FoldConstantArithmetic instead of
getNode.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  5 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 73 ++++++++++---------
 2 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 98eed6b7503d..c892bdcd7fd8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14819,8 +14819,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
     return DAG.getConstant(0, DL, VT);
 
   // fold (sext_in_reg c1) -> c1
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
-    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N0, N1);
+  if (SDValue C =
+          DAG.FoldConstantArithmetic(ISD::SIGN_EXTEND_INREG, DL, VT, {N0, N1}))
+    return C;
 
   // If the input is already sign extended, just drop the extension.
   if (ExtVTBits >= DAG.ComputeMaxSignificantBits(N0))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 43d49674297f..55cebc28e492 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6659,6 +6659,44 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
     if (TLI->isCommutativeBinOp(Opcode))
       if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ops[1]))
         return FoldSymbolOffset(Opcode, VT, GA, Ops[0].getNode());
+
+    // fold (sext_in_reg c1) -> c2
+    if (Opcode == ISD::SIGN_EXTEND_INREG) {
+      EVT EVT = cast<VTSDNode>(Ops[1])->getVT();
+
+      auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
+        unsigned FromBits = EVT.getScalarSizeInBits();
+        Val <<= Val.getBitWidth() - FromBits;
+        Val.ashrInPlace(Val.getBitWidth() - FromBits);
+        return getConstant(Val, DL, ConstantVT);
+      };
+
+      if (auto *C1 = dyn_cast<ConstantSDNode>(Ops[0])) {
+        const APInt &Val = C1->getAPIntValue();
+        return SignExtendInReg(Val, VT);
+      }
+
+      if (ISD::isBuildVectorOfConstantSDNodes(Ops[0].getNode())) {
+        SmallVector<SDValue> ScalarOps;
+        llvm::EVT OpVT = Ops[0].getOperand(0).getValueType();
+        for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
+          SDValue Op = Ops[0].getOperand(I);
+          if (Op.isUndef()) {
+            ScalarOps.push_back(getUNDEF(OpVT));
+            continue;
+          }
+          APInt Val = cast<ConstantSDNode>(Op)->getAPIntValue();
+          ScalarOps.push_back(SignExtendInReg(Val, OpVT));
+        }
+        return getBuildVector(VT, DL, ScalarOps);
+      }
+
+      if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR &&
+          isa<ConstantSDNode>(Ops[0].getOperand(0)))
+        return getNode(ISD::SPLAT_VECTOR, DL, VT,
+                       SignExtendInReg(Ops[0].getConstantOperandAPInt(0),
+                                       Ops[0].getOperand(0).getValueType()));
+    }
   }
 
   // This is for vector folding only from here on.
@@ -7205,41 +7243,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
            "Vector element counts must match in SIGN_EXTEND_INREG");
     assert(EVT.bitsLE(VT) && "Not extending!");
     if (EVT == VT) return N1;  // Not actually extending
-
-    auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
-      unsigned FromBits = EVT.getScalarSizeInBits();
-      Val <<= Val.getBitWidth() - FromBits;
-      Val.ashrInPlace(Val.getBitWidth() - FromBits);
-      return getConstant(Val, DL, ConstantVT);
-    };
-
-    if (N1C) {
-      const APInt &Val = N1C->getAPIntValue();
-      return SignExtendInReg(Val, VT);
-    }
-
-    if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
-      SmallVector<SDValue> Ops;
-      llvm::EVT OpVT = N1.getOperand(0).getValueType();
-      for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
-        SDValue Op = N1.getOperand(i);
-        if (Op.isUndef()) {
-          Ops.push_back(getUNDEF(OpVT));
-          continue;
-        }
-        ConstantSDNode *C = cast<ConstantSDNode>(Op);
-        APInt Val = C->getAPIntValue();
-        Ops.push_back(SignExtendInReg(Val, OpVT));
-      }
-      return getBuildVector(VT, DL, Ops);
-    }
-
-    if (N1.getOpcode() == ISD::SPLAT_VECTOR &&
-        isa<ConstantSDNode>(N1.getOperand(0)))
-      return getNode(
-          ISD::SPLAT_VECTOR, DL, VT,
-          SignExtendInReg(N1.getConstantOperandAPInt(0),
-                          N1.getOperand(0).getValueType()));
     break;
   }
   case ISD::FP_TO_SINT_SAT:
-- 
GitLab


From 093d4db2f3c874d4683fb01194b00dbb20e5c713 Mon Sep 17 00:00:00 2001
From: Campbell Barton
Date: Sun, 20 Oct 2024 07:09:06 +1100
Subject: [PATCH 164/511] Add "clang-format-on-save-mode" minor mode to
 clang-format.el (#104533)

Add a minor mode which can be optionally used to run clang-format on
save.

Formatting before saving works well and is convenient to avoid having
to remember to manually run clang-format.
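
For anyone who wants to try it out, a minimal init-file sketch could
look like the following (the choice of hooks and the use of `always'
are illustrative assumptions, not part of this change):

  ;; Hypothetical user configuration: enable format-on-save in C/C++
  ;; buffers via the minor mode added by this patch.
  (require 'clang-format)
  (add-hook 'c-mode-hook #'clang-format-on-save-mode)
  (add-hook 'c++-mode-hook #'clang-format-on-save-mode)
  ;; Optionally reformat even without a .clang-format file:
  (setq clang-format-on-save-p 'always)

By default the mode only reformats when a ".clang-format" file is found
in a parent directory, so enabling it broadly should be safe.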
I've written this as its own package but it's probably better if the
functionality is supported by clang-format.el.

See: https://github.com/melpa/melpa/pull/8762
---
 clang/tools/clang-format/clang-format.el | 57 ++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/clang/tools/clang-format/clang-format.el b/clang/tools/clang-format/clang-format.el
index f3da5415f867..fb943b7b722f 100644
--- a/clang/tools/clang-format/clang-format.el
+++ b/clang/tools/clang-format/clang-format.el
@@ -70,6 +70,20 @@ in such buffers."
   :safe #'stringp)
 (make-variable-buffer-local 'clang-format-fallback-style)
 
+(defcustom clang-format-on-save-p 'clang-format-on-save-check-config-exists
+  "Only reformat on save if this function returns non-nil.
+
+You may wish to choose one of the following options:
+- `always': To always format on save.
+- `clang-format-on-save-check-config-exists':
+  Only reformat when \".clang-format\" exists.
+
+Otherwise you can set this to a user defined function."
+  :group 'clang-format
+  :type 'function
+  :risky t)
+(make-variable-buffer-local 'clang-format-on-save-p)
+
 (defun clang-format--extract (xml-node)
   "Extract replacements and cursor information from XML-NODE."
   (unless (and (listp xml-node) (eq (xml-node-name xml-node) 'replacements))
@@ -217,5 +231,48 @@ the function `buffer-file-name'.
 ;;;###autoload
 (defalias 'clang-format 'clang-format-region)
 
+;; Format on save minor mode.
+
+(defun clang-format--on-save-buffer-hook ()
+  "The hook to run on buffer saving to format the buffer."
+  ;; Demote errors as this is user configurable, we can't be sure it won't error.
+  (when (with-demoted-errors "clang-format: Error %S"
+          (funcall clang-format-on-save-p))
+    (clang-format-buffer))
+  ;; Continue to save.
+  nil)
+
+(defun clang-format--on-save-enable ()
+  "Enable the minor mode."
+  (add-hook 'before-save-hook #'clang-format--on-save-buffer-hook nil t))
+
+(defun clang-format--on-save-disable ()
+  "Disable the minor mode."
+  (remove-hook 'before-save-hook #'clang-format--on-save-buffer-hook t))
+
+;; Default value for `clang-format-on-save-p'.
+(defun clang-format-on-save-check-config-exists ()
+  "Return non-nil when `.clang-format' is found in a parent directory."
+  ;; Unlikely but possible this is nil.
+  (let ((filepath buffer-file-name))
+    (cond
+     (filepath
+      (not (null (locate-dominating-file (file-name-directory filepath) ".clang-format"))))
+     (t
+      nil))))
+
+;;;###autoload
+(define-minor-mode clang-format-on-save-mode
+  "Clang-format on save minor mode."
+  :global nil
+  :lighter ""
+  :keymap nil
+
+  (cond
+   (clang-format-on-save-mode
+    (clang-format--on-save-enable))
+   (t
+    (clang-format--on-save-disable))))
+
 (provide 'clang-format)
 ;;; clang-format.el ends here
-- 
GitLab


From 10f6d01e3d6cd6963bb2ec8729ab4f0aff9fdb5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?=
Date: Sat, 19 Oct 2024 22:42:49 +0200
Subject: [PATCH 165/511] [GlobalISel][AArch64] Legalize G_EXTRACT_SUBVECTOR
 (#112946)

for future combines
---
 .../Target/GlobalISel/SelectionDAGCompat.td   |   1 +
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   5 +
 .../GlobalISel/legalizer-info-validation.mir  |   4 +-
 .../CodeGen/AArch64/extract-subvec-combine.ll | 161 +++++++++++++-----
 4 files changed, 124 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index d9121cf166e5..2d19e36cc842 100644
--- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -147,6 +147,7 @@ def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
+def : GINodeEquiv<G_EXTRACT_SUBVECTOR, extract_subvector>;
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e9d01602c298..fb6c23a96456 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1282,6 +1282,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
 
+  getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
+      .legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
+      .widenScalarOrEltToNextPow2(0)
+      .immIdx(0); // Inform verifier imm idx 0 is handled.
+
   getLegacyLegalizerInfo().computeTables();
   verify(*ST.getInstrInfo());
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 146d1177f469..4d096b7231c7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -652,8 +652,8 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_EXTRACT_SUBVECTOR (opcode {{[0-9]+}}): 2 type indices, 1 imm index
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 1, OK
 # DEBUG-NEXT: G_INSERT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll index 307974c012a9..43c6e0191146 100644 --- a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll +++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll @@ -1,12 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <2 x i32> @and_extract_zext_idx0(<4 x i16> %vec) nounwind { -; CHECK-LABEL: and_extract_zext_idx0: -; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_extract_zext_idx0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_extract_zext_idx0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0) %and = and <2 x i32> %extract, @@ -14,11 +22,18 @@ define <2 x i32> @and_extract_zext_idx0(<4 x i16> %vec) nounwind { } define <4 x i16> @and_extract_sext_idx0(<8 x i8> %vec) nounwind { -; CHECK-LABEL: and_extract_sext_idx0: -; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_extract_sext_idx0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_extract_sext_idx0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %sext = sext <8 x i8> %vec to <8 x i16> %extract = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %sext, i64 0) %and = and <4 x i16> %extract, @@ -26,12 +41,20 @@ define <4 x i16> @and_extract_sext_idx0(<8 x i8> %vec) nounwind { } define <2 x i32> @and_extract_zext_idx2(<4 x i16> %vec) nounwind { -; CHECK-LABEL: and_extract_zext_idx2: -; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_extract_zext_idx2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_extract_zext_idx2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 2) %and = and <2 x i32> %extract, @@ -39,12 +62,20 @@ define <2 x i32> @and_extract_zext_idx2(<4 x i16> %vec) nounwind { } define 
<4 x i16> @and_extract_sext_idx4(<8 x i8> %vec) nounwind { -; CHECK-LABEL: and_extract_sext_idx4: -; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_extract_sext_idx4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_extract_sext_idx4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %sext = sext <8 x i8> %vec to <8 x i16> %extract = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %sext, i64 4) %and = and <4 x i16> %extract, @@ -52,11 +83,18 @@ define <4 x i16> @and_extract_sext_idx4(<8 x i8> %vec) nounwind { } define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind { -; CHECK-LABEL: sext_extract_zext_idx0: -; CHECK: // %bb.0: -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sext_extract_zext_idx0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sext_extract_zext_idx0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0) %sext_inreg_step0 = shl <2 x i32> %extract, @@ -80,11 +118,18 @@ define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind { } define <4 x i16> @sext_extract_sext_idx0(<8 x i8> %vec) nounwind { -; CHECK-LABEL: sext_extract_sext_idx0: -; CHECK: // %bb.0: -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sext_extract_sext_idx0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sext_extract_sext_idx0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-GI-NEXT: ret %sext = sext <8 x i8> %vec to <8 x i16> %extract = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %sext, i64 0) %sext_inreg_step0 = shl <4 x i16> %extract, @@ -93,12 +138,20 @@ define <4 x i16> @sext_extract_sext_idx0(<8 x i8> %vec) nounwind { } define <2 x i32> @sext_extract_zext_idx2(<4 x i16> %vec) nounwind { -; CHECK-LABEL: sext_extract_zext_idx2: -; CHECK: // %bb.0: -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sext_extract_zext_idx2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sext_extract_zext_idx2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: 
sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 2) %sext_inreg_step0 = shl <2 x i32> %extract, @@ -107,12 +160,20 @@ define <2 x i32> @sext_extract_zext_idx2(<4 x i16> %vec) nounwind { } define <4 x i16> @sext_extract_sext_idx4(<8 x i8> %vec) nounwind { -; CHECK-LABEL: sext_extract_sext_idx4: -; CHECK: // %bb.0: -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sext_extract_sext_idx4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sext_extract_sext_idx4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-GI-NEXT: ret %sext = sext <8 x i8> %vec to <8 x i16> %extract = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %sext, i64 4) %sext_inreg_step0 = shl <4 x i16> %extract, @@ -120,5 +181,15 @@ define <4 x i16> @sext_extract_sext_idx4(<8 x i8> %vec) nounwind { ret <4 x i16> %sext_inreg } +define <8 x i8> @sext_extract_idx(<16 x i8> %vec) nounwind { +; CHECK-LABEL: sext_extract_idx: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %extract = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %vec, i64 0) + ret <8 x i8> %extract +} + declare <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32>, i64) declare <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16>, i64) +declare <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8>, i64) -- GitLab From 8819267747c868309d606f58cb616b05217622eb Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 19 Oct 2024 14:38:40 -0700 Subject: [PATCH 166/511] [InstCombine] Simplify code with SmallMapVector::operator[] (NFC) (#113022) --- llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 72ebd9fbb6d9..d9d41e052a32 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -870,8 +870,7 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) { if (ICmp && ICmp->isEquality() && getUnderlyingObject(*U) == Alloca) { // Collect equality icmps of the alloca, and don't treat them as // captures. 
- auto Res = ICmps.insert({ICmp, 0}); - Res.first->second |= 1u << U->getOperandNo(); + ICmps[ICmp] |= 1u << U->getOperandNo(); return false; } -- GitLab From ca9f396cac0371a398eeef73182987a55a21e4a1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 19 Oct 2024 14:39:25 -0700 Subject: [PATCH 167/511] [lldb] Avoid repeated hash lookups (NFC) (#113024) --- lldb/source/Core/DataFileCache.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lldb/source/Core/DataFileCache.cpp b/lldb/source/Core/DataFileCache.cpp index a8127efc1df0..ef0e07a8b034 100644 --- a/lldb/source/Core/DataFileCache.cpp +++ b/lldb/source/Core/DataFileCache.cpp @@ -264,14 +264,12 @@ bool CacheSignature::Decode(const lldb_private::DataExtractor &data, } uint32_t ConstStringTable::Add(ConstString s) { - auto pos = m_string_to_offset.find(s); - if (pos != m_string_to_offset.end()) - return pos->second; - const uint32_t offset = m_next_offset; - m_strings.push_back(s); - m_string_to_offset[s] = offset; - m_next_offset += s.GetLength() + 1; - return offset; + auto [pos, inserted] = m_string_to_offset.try_emplace(s, m_next_offset); + if (inserted) { + m_strings.push_back(s); + m_next_offset += s.GetLength() + 1; + } + return pos->second; } static const llvm::StringRef kStringTableIdentifier("STAB"); -- GitLab From f4136b326514b0732054e17eadc646b45925192d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 19 Oct 2024 14:42:25 -0700 Subject: [PATCH 168/511] [llvm-diff] Avoid repeated hash lookups (NFC) (#113025) --- llvm/tools/llvm-diff/lib/DifferenceEngine.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp b/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp index 05cae4b67d7e..9be0eec7b73f 100644 --- a/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp +++ b/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp @@ -189,11 +189,11 @@ class FunctionDifferenceEngine { // The returned reference is not permanently valid and should not be stored. BlockDiffCandidate &getOrCreateBlockDiffCandidate(const BasicBlock *LBB, const BasicBlock *RBB) { - auto It = BlockDiffCandidateIndices.find(LBB); + auto [It, Inserted] = + BlockDiffCandidateIndices.try_emplace(LBB, BlockDiffCandidates.size()); // Check if LBB already has a diff candidate - if (It == BlockDiffCandidateIndices.end()) { + if (Inserted) { // Add new one - BlockDiffCandidateIndices[LBB] = BlockDiffCandidates.size(); BlockDiffCandidates.push_back( {LBB, RBB, SmallDenseMap(), false}); return BlockDiffCandidates.back(); -- GitLab From b26df3e463cd1d65adadcd469fcd4b203484e39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 20 Oct 2024 00:49:16 +0300 Subject: [PATCH 169/511] Revert "[DAG] isConstantIntBuildVectorOrConstantInt - peek through bitcasts (#112710)" This reverts commit a630771b28f4b252e2754776b8f3ab416133951a. This caused compilation to hang for Windows/ARM, see https://github.com/llvm/llvm-project/pull/112710 for details. 
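With the peek-through behavior reverted out of the helper itself, call sites that still need to look through bitcasts must strip them explicitly before querying, as the restored code below does. A minimal caller-side sketch, using only names that appear in this patch (the operand variable Op is hypothetical):

    // Match a constant-int operand that may be hidden behind a bitcast,
    // via the restored SDNode*-returning interface.
    SDValue Op = N->getOperand(1); // hypothetical operand
    if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(
            peekThroughBitcasts(Op))) {
      // C is the underlying constant or constant build_vector node.
      (void)C;
    }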
--- llvm/include/llvm/CodeGen/SelectionDAG.h | 5 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 ++- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 47 ++- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +- llvm/test/CodeGen/X86/avx2-arith.ll | 2 +- llvm/test/CodeGen/X86/combine-sra.ll | 9 +- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 50 ++-- llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 60 ++-- .../CodeGen/X86/min-legal-vector-width.ll | 18 +- llvm/test/CodeGen/X86/pmul.ll | 62 ++-- .../CodeGen/X86/prefer-avx256-wide-mul.ll | 2 +- llvm/test/CodeGen/X86/psubus.ll | 81 ++--- llvm/test/CodeGen/X86/sat-add.ll | 4 +- .../X86/vector-shuffle-combining-sse41.ll | 6 +- llvm/test/CodeGen/X86/vector-trunc-packus.ll | 275 +++++++++-------- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 275 +++++++++-------- llvm/test/CodeGen/X86/vector-trunc-usat.ll | 279 ++++++++++-------- 18 files changed, 650 insertions(+), 570 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 12ff36c89e33..b8f80738486a 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -2301,11 +2301,10 @@ public: Align getEVTAlign(EVT MemoryVT) const; /// Test whether the given value is a constant int or similar node. - bool isConstantIntBuildVectorOrConstantInt(SDValue N, - bool AllowOpaques = true) const; + SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N) const; /// Test whether the given value is a constant FP or similar node. - bool isConstantFPBuildVectorOrConstantFP(SDValue N) const; + SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) const ; /// \returns true if \p N is any kind of constant or build_vector of /// constants, int or float. If a vector, it may not necessarily be a splat. 
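One consequence of the signature change above: the reverted overload's AllowOpaques flag is gone, so callers that must reject opaque constants filter the returned SDNode* themselves. A minimal sketch of that check, mirroring the IsNonOpaqueConstant lambda restored in X86ISelLowering.cpp further down in this patch:

    auto IsNonOpaqueConstant = [&](SDValue Op) {
      if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
        // Opaque constants must not be folded into instruction immediates.
        if (auto *Cst = dyn_cast<ConstantSDNode>(C))
          return !Cst->isOpaque();
        return true; // build_vector / splat / global-address cases
      }
      return false;
    };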
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c892bdcd7fd8..f89734fb43e9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1205,13 +1205,13 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); - if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) { + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) { SDNodeFlags NewFlags; if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap()) NewFlags.setNoUnsignedWrap(true); - if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1})) return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags); @@ -9931,10 +9931,10 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { // fold (rot* (rot* x, c2), c1) // -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize) + bitsize) % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { - bool C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); - bool C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); - if (C1 && C2 && N1.getValueType() == N0.getOperand(1).getValueType()) { - EVT ShiftVT = N1.getValueType(); + SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); + SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); + if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { + EVT ShiftVT = C1->getValueType(0); bool SameSide = (N->getOpcode() == NextOp); unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); @@ -16806,8 +16806,8 @@ SDValue DAGCombiner::visitVP_FADD(SDNode *N) { SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); - bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); + SDNode *N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); + SDNode *N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; @@ -16904,8 +16904,10 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // of rounding steps. 
if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { if (N0.getOpcode() == ISD::FMUL) { - bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); - bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); + SDNode *CFP00 = + DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); + SDNode *CFP01 = + DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); // (fadd (fmul x, c), x) -> (fmul x, c+1) if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { @@ -16925,8 +16927,10 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } if (N1.getOpcode() == ISD::FMUL) { - bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); - bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); + SDNode *CFP10 = + DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); + SDNode *CFP11 = + DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); // (fadd x, (fmul x, c)) -> (fmul x, c+1) if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { @@ -16946,7 +16950,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } if (N0.getOpcode() == ISD::FADD) { - bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); + SDNode *CFP00 = + DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); // (fadd (fadd x, x), x) -> (fmul x, 3.0) if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) && (N0.getOperand(0) == N1)) { @@ -16956,7 +16961,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } if (N1.getOpcode() == ISD::FADD) { - bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); + SDNode *CFP10 = + DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); // (fadd x, (fadd x, x)) -> (fmul x, 3.0) if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && N1.getOperand(0) == N0) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 55cebc28e492..4b6477957063 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7000,10 +7000,10 @@ void SelectionDAG::canonicalizeCommutativeBinop(unsigned Opcode, SDValue &N1, // Canonicalize: // binop(const, nonconst) -> binop(nonconst, const) - bool N1C = isConstantIntBuildVectorOrConstantInt(N1); - bool N2C = isConstantIntBuildVectorOrConstantInt(N2); - bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); - bool N2CFP = isConstantFPBuildVectorOrConstantFP(N2); + SDNode *N1C = isConstantIntBuildVectorOrConstantInt(N1); + SDNode *N2C = isConstantIntBuildVectorOrConstantInt(N2); + SDNode *N1CFP = isConstantFPBuildVectorOrConstantFP(N1); + SDNode *N2CFP = isConstantFPBuildVectorOrConstantFP(N2); if ((N1C && !N2C) || (N1CFP && !N2CFP)) std::swap(N1, N2); @@ -13200,44 +13200,39 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) { return true; } -// Returns true if it is a constant integer BuildVector or constant integer, -// possibly hidden by a bitcast. -bool SelectionDAG::isConstantIntBuildVectorOrConstantInt( - SDValue N, bool AllowOpaques) const { - N = peekThroughBitcasts(N); - - if (auto *C = dyn_cast<ConstantSDNode>(N)) - return AllowOpaques || !C->isOpaque(); - +// Returns the SDNode if it is a constant integer BuildVector +// or constant integer. +SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) const { + if (isa<ConstantSDNode>(N)) + return N.getNode(); if (ISD::isBuildVectorOfConstantSDNodes(N.getNode())) - return true; - + return N.getNode(); // Treat a GlobalAddress supporting constant offset folding as a // constant integer. 
- if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N)) if (GA->getOpcode() == ISD::GlobalAddress && TLI->isOffsetFoldingLegal(GA)) - return true; - + return GA; if ((N.getOpcode() == ISD::SPLAT_VECTOR) && isa<ConstantSDNode>(N.getOperand(0))) - return true; - return false; + return N.getNode(); + return nullptr; } -// Returns true if it is a constant float BuildVector or constant float. -bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const { +// Returns the SDNode if it is a constant float BuildVector +// or constant float. +SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const { if (isa<ConstantFPSDNode>(N)) - return true; + return N.getNode(); if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) - return true; + return N.getNode(); if ((N.getOpcode() == ISD::SPLAT_VECTOR) && isa<ConstantFPSDNode>(N.getOperand(0))) - return true; + return N.getNode(); - return false; + return nullptr; } std::optional<bool> SelectionDAG::isBoolConstant(SDValue N, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7448416c682a..d5466e0a1cbd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -20760,7 +20760,7 @@ static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) { if (!Add.hasOneUse()) return SDValue(); - if (DAG.isConstantIntBuildVectorOrConstantInt(X)) + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X))) return SDValue(); SDValue M1 = Add.getOperand(0); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bcb84add65d8..08321024fb65 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -56546,9 +56546,14 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); SDLoc DL(N); + // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt. auto IsNonOpaqueConstant = [&](SDValue Op) { - return DAG.isConstantIntBuildVectorOrConstantInt(Op, - /*AllowOpaques*/ false); + if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) { + if (auto *Cst = dyn_cast<ConstantSDNode>(C)) + return !Cst->isOpaque(); + return true; + } + return false; }; // X86 can't encode an immediate LHS of a sub. 
See if we can push the diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index 44ab33ad67f2..90733dfb8465 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -122,7 +122,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; CHECK-LABEL: mul_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3 +; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm3 ; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3 ; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index c982884314f6..7eee418742dd 100644 --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -725,11 +725,12 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pxor %xmm7, %xmm6 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519] -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index ee83a79b6dd5..6fd3db3464de 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2369,8 +2369,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -2391,7 +2391,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -2432,7 +2432,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2450,7 +2450,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2592,8 +2592,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm3, %xmm5 ; SSE41-NEXT: pand %xmm2, %xmm5 @@ -2616,7 +2616,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -2659,7 +2659,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2677,7 +2677,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2823,8 +2823,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: 
pand %xmm3, %xmm5 @@ -2846,7 +2846,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -2889,7 +2889,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -2908,7 +2908,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -3054,8 +3054,8 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -3077,7 +3077,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -3120,7 +3120,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3139,7 +3139,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: 
vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3287,8 +3287,8 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -3311,7 +3311,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -3356,7 +3356,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3376,7 +3376,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index b4e8f0a23016..5a1c4c8a52c8 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1914,7 +1914,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -1922,7 +1922,7 @@ define <32 x i8> 
@vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -1944,7 +1944,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1974,14 +1974,14 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -1999,7 +1999,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2088,7 +2088,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8 +; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6 @@ -2096,7 +2096,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2120,7 +2120,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: 
vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -2150,14 +2150,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -2176,7 +2176,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -2266,7 +2266,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2274,7 +2274,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2297,7 +2297,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2328,14 +2328,14 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: 
vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -2354,7 +2354,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2444,7 +2444,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2452,7 +2452,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2475,7 +2475,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2506,14 +2506,14 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -2532,7 +2532,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: 
vpandn %ymm2, %ymm3, %ymm2 @@ -2623,7 +2623,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2631,7 +2631,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2655,7 +2655,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2687,14 +2687,14 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -2714,7 +2714,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 9b08d8baacee..8289e885618f 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -892,13 +892,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 +; CHECK-SKX-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, 
%ymm1, %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 ; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 +; CHECK-SKX-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) @@ -913,13 +913,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5 +; CHECK-AVX512-NEXT: vpand %ymm3, %ymm4, %ymm5 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) -; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3 +; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 @@ -939,13 +939,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 +; CHECK-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 ; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 +; CHECK-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 ; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) @@ -967,7 +967,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; CHECK-SKX-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 @@ -980,7 +980,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; CHECK-AVX512-NEXT: vpandq %zmm1, %zmm2, %zmm3 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 @@ -997,7 +997,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; CHECK-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] ; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index fe8a4fa16312..6c3d04863118 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -161,8 +161,8 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; SSE41-LABEL: mul_v16i8: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm3, %xmm4 ; SSE41-NEXT: pand %xmm2, %xmm4 @@ -586,16 +586,17 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm5 -; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm2, %xmm6 +; SSE41-NEXT: pmaddubsw %xmm5, %xmm6 ; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pandn %xmm2, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm5, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pand %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm2, %xmm5 ; SSE41-NEXT: pand %xmm4, %xmm5 @@ -608,7 +609,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX2-LABEL: mul_v32i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm3 ; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3 ; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 @@ -620,7 +621,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX512F-LABEL: mul_v32i8: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm3 ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; 
AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 ; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 @@ -901,34 +902,37 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pandn %xmm4, %xmm9 -; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm4, %xmm9 ; SSE41-NEXT: movdqa %xmm0, %xmm10 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm10 +; SSE41-NEXT: pmaddubsw %xmm9, %xmm10 ; SSE41-NEXT: pand %xmm8, %xmm10 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pandn %xmm4, %xmm9 ; SSE41-NEXT: pmaddubsw %xmm9, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pandn %xmm5, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: pmaddubsw %xmm5, %xmm9 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm9 ; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pandn %xmm5, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: por %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pandn %xmm6, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pand %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm6, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pandn %xmm6, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm2 ; SSE41-NEXT: psllw $8, %xmm2 ; SSE41-NEXT: por %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm7, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm5 @@ -941,14 +945,14 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX2-LABEL: mul_v64i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm5 ; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 ; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 @@ -959,28 +963,28 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; AVX512F-LABEL: mul_v64i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm6 ; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: 
vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1
+; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm1
 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1
 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2)
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm3)
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: mul_v64i8:
 ; AVX512BW: # %bb.0: # %entry
 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm3
 ; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index 885b07585e68..c9bb3de92dcd 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -59,7 +59,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX256BW-LABEL: test_mul_32i8:
 ; AVX256BW: # %bb.0:
 ; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX256BW-NEXT: vpand %ymm1, %ymm2, %ymm3
 ; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
 ; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1
 ; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 9656822d144e..be8adf697d5c 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1671,11 +1671,12 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE41: # %bb.0: # %vector.ph
 ; SSE41-NEXT: movdqa %xmm0, %xmm5
 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm4, %xmm8
-; SSE41-NEXT: pxor %xmm9, %xmm8
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
@@ -1683,20 +1684,22 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535]
 ; SSE41-NEXT: movapd %xmm8, %xmm10
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm9, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm7, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: movapd %xmm8, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
 ; SSE41-NEXT: packusdw %xmm10, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm9, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm7, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm3, %xmm0
@@ -2768,11 +2771,12 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm7
 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm9
-; SSE41-NEXT: pxor %xmm10, %xmm9
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm10, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE41-NEXT: movdqa %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
@@ -2780,10 +2784,11 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
 ; SSE41-NEXT: movapd %xmm9, %xmm11
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm10, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm8, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
 ; SSE41-NEXT: pand %xmm3, %xmm0
@@ -2792,10 +2797,11 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2]
 ; SSE41-NEXT: pmaxud %xmm3, %xmm7
 ; SSE41-NEXT: psubd %xmm3, %xmm7
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm10, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm8, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm6, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2991,11 +2997,12 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: pand %xmm0, %xmm1
 ; SSE41-NEXT: pand %xmm0, %xmm6
 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm9
-; SSE41-NEXT: pxor %xmm10, %xmm9
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm10, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm9
+; SSE41-NEXT: movdqa %xmm8, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
@@ -3003,10 +3010,11 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
 ; SSE41-NEXT: movapd %xmm9, %xmm11
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm10, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm8, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
 ; SSE41-NEXT: pand %xmm3, %xmm0
@@ -3015,10 +3023,11 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2]
 ; SSE41-NEXT: pmaxud %xmm3, %xmm6
 ; SSE41-NEXT: psubd %xmm3, %xmm6
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm10, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm8, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
 ; SSE41-NEXT: pand %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index b12be7cb129d..949902a5ebc4 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -631,8 +631,8 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
 ; SSE41-NEXT: pxor %xmm1, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372034707292117,9223372034707292117]
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index d3e4906450e4..efe34c52b371 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -84,8 +84,8 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
 ; SSE-NEXT: pshufb %xmm8, %xmm1
 ; SSE-NEXT: por %xmm4, %xmm1
 ; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
 ; SSE-NEXT: movdqa %xmm0, %xmm4
 ; SSE-NEXT: pmaddubsw %xmm3, %xmm4
 ; SSE-NEXT: pand %xmm2, %xmm4
@@ -120,7 +120,7 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2
 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 0af5e9aeccd9..5568604ac29a 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -57,8 +57,8 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -175,8 +175,8 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -317,12 +317,12 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -330,8 +330,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -584,32 +584,35 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647]
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm8, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm5
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm3, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -617,8 +620,8 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -825,8 +828,8 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -968,8 +971,8 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -1140,12 +1143,12 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -1153,8 +1156,8 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -1330,12 +1333,12 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -1343,8 +1346,8 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -1580,32 +1583,35 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183]
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -1613,8 +1619,8 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -2233,8 +2239,8 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -2387,8 +2393,8 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -2533,12 +2539,12 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -2546,8 +2552,8 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2727,12 +2733,12 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -2740,8 +2746,8 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2981,32 +2987,35 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -3014,8 +3023,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -3268,32 +3277,35 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm8, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm1, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -3301,8 +3313,8 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -3665,72 +3677,79 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483903,2147483903]
+; SSE41-NEXT: movdqa %xmm9, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pxor %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm5
 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
-; SSE41-NEXT: movdqa %xmm10, %xmm6
-; SSE41-NEXT: pxor %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm6, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm6
 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6
-; SSE41-NEXT: movdqa %xmm12, %xmm10
-; SSE41-NEXT: pxor %xmm2, %xmm10
+; SSE41-NEXT: movdqa %xmm12, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm10, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm10
 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10
-; SSE41-NEXT: movdqa %xmm11, %xmm12
-; SSE41-NEXT: pxor %xmm2, %xmm12
+; SSE41-NEXT: movdqa %xmm11, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm12
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm12, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm12
 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
-; SSE41-NEXT: movdqa %xmm8, %xmm11
-; SSE41-NEXT: pxor %xmm2, %xmm11
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm11, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
@@ -3738,8 +3757,8 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm8
+; SSE41-NEXT: movdqa %xmm9, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 3c03c521c272..d276a6873012 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -59,8 +59,8 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -182,8 +182,8 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -334,12 +334,12 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -347,8 +347,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -604,32 +604,35 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [2147483647,2147483647]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0]
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm8, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm5
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm3, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -637,8 +640,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -846,8 +849,8 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -980,8 +983,8 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -1146,12 +1149,12 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -1159,8 +1162,8 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -1330,12 +1333,12 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -1343,8 +1346,8 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -1576,32 +1579,35 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [32767,32767]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415]
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -1609,8 +1615,8 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -1996,8 +2002,8 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -2142,8 +2148,8 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
 ; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
@@ -2282,12 +2288,12 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -2295,8 +2301,8 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2470,12 +2476,12 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm7, %xmm0
@@ -2483,8 +2489,8 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
@@ -2720,32 +2726,35 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm2, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -2753,8 +2762,8 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -3013,32 +3022,35 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm2, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm8, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm1, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: movdqa %xmm6, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
 ; SSE41-NEXT: por %xmm9, %xmm0
@@ -3046,8 +3058,8 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8
 ; SSE41-NEXT: movdqa %xmm5, %xmm0
 ; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -3418,72 +3430,79 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4
 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm9, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm3, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm3
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm4, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm4
 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pxor %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm5
 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
-; SSE41-NEXT: movdqa %xmm10, %xmm6
-; SSE41-NEXT: pxor %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm6, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm6
 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6
-; SSE41-NEXT: movdqa %xmm12, %xmm10
-; SSE41-NEXT: pxor %xmm2, %xmm10
+; SSE41-NEXT: movdqa %xmm12, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm10
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm10, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm10
 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10
-; SSE41-NEXT: movdqa %xmm11, %xmm12
-; SSE41-NEXT: pxor %xmm2, %xmm12
+; SSE41-NEXT: movdqa %xmm11, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm12
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm12
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm12, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm1, %xmm12
 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
-; SSE41-NEXT: movdqa %xmm8, %xmm11
-; SSE41-NEXT: pxor %xmm2, %xmm11
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
 ; SSE41-NEXT: movdqa %xmm9, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm11
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
 ; SSE41-NEXT: pand %xmm11, %xmm0
 ; SSE41-NEXT: por %xmm13, %xmm0
@@ -3491,8 +3510,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
 ; SSE41-NEXT: movdqa %xmm7, %xmm0
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm8
+; SSE41-NEXT: movdqa %xmm9, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
 ; SSE41-NEXT: pand %xmm8, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index c1d22dc7daf2..412661693747 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -207,20 +207,20 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT: movdqa %xmm5, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: pand %xmm6, %xmm3
 ; SSE41-NEXT: pxor %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729]
 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
@@ -407,31 +407,34 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) {
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm1
 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm9
-; SSE41-NEXT: pxor %xmm6, %xmm9
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE41-NEXT: movdqa %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm3, %xmm9
 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm8, %xmm1
-; SSE41-NEXT: pxor %xmm6, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm1, %xmm0
 ; SSE41-NEXT: movapd %xmm3, %xmm1
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2]
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm6, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
 ; SSE41-NEXT: pand %xmm8, %xmm0
@@ -787,25 +790,26 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE41-NEXT: movdqa %xmm4, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pand %xmm7, %xmm4
+; SSE41-NEXT: pand %xmm6, %xmm4
 ; SSE41-NEXT: movdqa %xmm4, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: packusdw %xmm6, %xmm2
+; SSE41-NEXT: packusdw %xmm7, %xmm2
 ; SSE41-NEXT: packusdw %xmm2, %xmm2
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
@@ -920,25 +924,26 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE41-NEXT: movdqa %xmm3, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT: pxor %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pand %xmm7, %xmm3
+; SSE41-NEXT: pand %xmm6, %xmm3
 ; SSE41-NEXT: movdqa %xmm3, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: packusdw %xmm6, %xmm4
+; SSE41-NEXT: packusdw %xmm7, %xmm4
 ; SSE41-NEXT: packusdw %xmm4, %xmm4
 ; SSE41-NEXT: movq %xmm4, (%rdi)
 ; SSE41-NEXT: retq
@@ -1089,31 +1094,34 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm7
 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm9
-; SSE41-NEXT: pxor %xmm6, %xmm9
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE41-NEXT: movdqa %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm3, %xmm9
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE41-NEXT: movdqa %xmm8, %xmm2
-; SSE41-NEXT: pxor %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm2, %xmm0
 ; SSE41-NEXT: movapd %xmm3, %xmm2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT: packusdw %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm6, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
 ; SSE41-NEXT: pand %xmm8, %xmm0
@@ -1861,25 +1869,26 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE41-NEXT: movdqa %xmm4, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pand %xmm7, %xmm4
+; SSE41-NEXT: pand %xmm6, %xmm4
 ; SSE41-NEXT: movdqa %xmm4, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: packusdw %xmm6, %xmm2
+; SSE41-NEXT: packusdw %xmm7, %xmm2
 ; SSE41-NEXT: packusdw %xmm2, %xmm2
 ; SSE41-NEXT: packuswb %xmm2, %xmm2
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -1996,25 +2005,26 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE41-NEXT: movdqa %xmm3, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
 ; SSE41-NEXT: pxor %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pand %xmm7, %xmm3
+; SSE41-NEXT: pand %xmm6, %xmm3
 ; SSE41-NEXT: movdqa %xmm3, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: packusdw %xmm6, %xmm4
+; SSE41-NEXT: packusdw %xmm7, %xmm4
 ; SSE41-NEXT: packusdw %xmm4, %xmm4
 ; SSE41-NEXT: packuswb %xmm4, %xmm4
 ; SSE41-NEXT: movd %xmm4, (%rdi)
@@ -2165,31 +2175,34 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm7
 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm9
-; SSE41-NEXT: pxor %xmm6, %xmm9
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE41-NEXT: movdqa %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm3, %xmm9
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE41-NEXT: movdqa %xmm8, %xmm2
-; SSE41-NEXT: pxor %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm2, %xmm0
 ; SSE41-NEXT: movapd %xmm3, %xmm2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT: packusdw %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm7, %xmm8
-; SSE41-NEXT: pxor %xmm6, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm8
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
 ; SSE41-NEXT: pand %xmm8, %xmm0
@@ -2347,31 +2360,34 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm6
 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm7, %xmm9
-; SSE41-NEXT: pxor %xmm5, %xmm9
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE41-NEXT: movdqa %xmm4, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm9, %xmm0
 ; SSE41-NEXT: movapd %xmm2, %xmm9
 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9
-; SSE41-NEXT: movdqa %xmm8, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm7
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
 ; SSE41-NEXT: pand %xmm7, %xmm0
 ; SSE41-NEXT: movapd %xmm2, %xmm7
 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
 ; SSE41-NEXT: packusdw %xmm9, %xmm7
-; SSE41-NEXT: movdqa %xmm6, %xmm8
-; SSE41-NEXT: pxor %xmm5, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm8
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
 ; SSE41-NEXT: pand %xmm8, %xmm0
@@ -2586,40 +2602,44 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
 ; SSE41-NEXT: movdqa 48(%rdi), %xmm11
 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm13
-; SSE41-NEXT: pxor %xmm7, %xmm13
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm13
+; SSE41-NEXT: movdqa %xmm6, %xmm13
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm13
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm14, %xmm0
 ; SSE41-NEXT: pand %xmm13, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm13
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13
-; SSE41-NEXT: movdqa %xmm12, %xmm2
-; SSE41-NEXT: pxor %xmm7, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm12, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm14, %xmm0
 ; SSE41-NEXT: pand %xmm2, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm2
 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm2
 ; SSE41-NEXT: packusdw %xmm13, %xmm2
-; SSE41-NEXT: movdqa %xmm11, %xmm12
-; SSE41-NEXT: pxor %xmm7, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm12
+; SSE41-NEXT: movdqa %xmm11, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm12
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm12
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: pcmpgtd %xmm13, %xmm0
 ; SSE41-NEXT: pand %xmm12, %xmm0
 ; SSE41-NEXT: movapd %xmm4, %xmm12
 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12
-; SSE41-NEXT: movdqa %xmm10, %xmm11
-; SSE41-NEXT: pxor %xmm7, %xmm11
-; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm11
+; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
+; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2]
 ;
SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pcmpgtd %xmm13, %xmm0
; SSE41-NEXT: pand %xmm11, %xmm0
@@ -2627,29 +2647,32 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11
; SSE41-NEXT: packusdw %xmm12, %xmm11
; SSE41-NEXT: packusdw %xmm11, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm10
-; SSE41-NEXT: pxor %xmm7, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
; SSE41-NEXT: pand %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pxor %xmm7, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm9
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
+; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9
; SSE41-NEXT: packusdw %xmm10, %xmm9
-; SSE41-NEXT: movdqa %xmm5, %xmm8
-; SSE41-NEXT: pxor %xmm7, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm8
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm0
-- 
GitLab


From 2eb1699184cf4d5de69f7825f66d7b3c04827f77 Mon Sep 17 00:00:00 2001
From: Tor Shepherd
Date: Sat, 19 Oct 2024 18:19:01 -0400
Subject: [PATCH 170/511] [clangd] Add inlay hints for default function
 arguments (#95712)

The new inlay hints have the `DefaultArguments` kind and can be enabled
in config similar to other inlay hint kinds.

---
 clang-tools-extra/clangd/Config.h             |  1 +
 clang-tools-extra/clangd/ConfigCompile.cpp    |  6 +-
 clang-tools-extra/clangd/ConfigFragment.h     |  3 +
 clang-tools-extra/clangd/ConfigYAML.cpp       |  5 +-
 clang-tools-extra/clangd/InlayHints.cpp       | 78 +++++++++++++++++--
 clang-tools-extra/clangd/Protocol.cpp         |  3 +
 clang-tools-extra/clangd/Protocol.h           |  9 +++
 .../clangd/unittests/InlayHintTests.cpp       | 73 +++++++++++++++++
 clang-tools-extra/docs/ReleaseNotes.rst       |  2 +
 9 files changed, 170 insertions(+), 10 deletions(-)

diff --git a/clang-tools-extra/clangd/Config.h b/clang-tools-extra/clangd/Config.h
index 8fcbc5c33469..e174f7fabe34 100644
--- a/clang-tools-extra/clangd/Config.h
+++ b/clang-tools-extra/clangd/Config.h
@@ -162,6 +162,7 @@ struct Config {
     bool DeducedTypes = true;
     bool Designators = true;
     bool BlockEnd = false;
+    bool DefaultArguments = false;
    // Limit the length of type names in inlay hints. 
(0 means no limit) uint32_t TypeNameLimit = 32; } InlayHints; diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp index 58610a5b8792..fb7692998d05 100644 --- a/clang-tools-extra/clangd/ConfigCompile.cpp +++ b/clang-tools-extra/clangd/ConfigCompile.cpp @@ -43,7 +43,6 @@ #include "llvm/Support/Regex.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" -#include #include #include #include @@ -669,6 +668,11 @@ struct FragmentCompiler { Out.Apply.push_back([Value(**F.BlockEnd)](const Params &, Config &C) { C.InlayHints.BlockEnd = Value; }); + if (F.DefaultArguments) + Out.Apply.push_back( + [Value(**F.DefaultArguments)](const Params &, Config &C) { + C.InlayHints.DefaultArguments = Value; + }); if (F.TypeNameLimit) Out.Apply.push_back( [Value(**F.TypeNameLimit)](const Params &, Config &C) { diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h index fc1b45f5d4c3..36f7d04231c4 100644 --- a/clang-tools-extra/clangd/ConfigFragment.h +++ b/clang-tools-extra/clangd/ConfigFragment.h @@ -339,6 +339,9 @@ struct Fragment { std::optional> Designators; /// Show defined symbol names at the end of a definition block. std::optional> BlockEnd; + /// Show parameter names and default values of default arguments after all + /// of the explicit arguments. + std::optional> DefaultArguments; /// Limit the length of type name hints. (0 means no limit) std::optional> TypeNameLimit; }; diff --git a/clang-tools-extra/clangd/ConfigYAML.cpp b/clang-tools-extra/clangd/ConfigYAML.cpp index bcdda99eeed6..32e028981d42 100644 --- a/clang-tools-extra/clangd/ConfigYAML.cpp +++ b/clang-tools-extra/clangd/ConfigYAML.cpp @@ -14,7 +14,6 @@ #include "llvm/Support/YAMLParser.h" #include #include -#include namespace clang { namespace clangd { @@ -268,6 +267,10 @@ private: if (auto Value = boolValue(N, "BlockEnd")) F.BlockEnd = *Value; }); + Dict.handle("DefaultArguments", [&](Node &N) { + if (auto Value = boolValue(N, "DefaultArguments")) + F.DefaultArguments = *Value; + }); Dict.handle("TypeNameLimit", [&](Node &N) { if (auto Value = uint32Value(N, "TypeNameLimit")) F.TypeNameLimit = *Value; diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index cd4f1931b3ce..c4053fced81d 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -11,9 +11,11 @@ #include "Config.h" #include "HeuristicResolver.h" #include "ParsedAST.h" +#include "Protocol.h" #include "SourceCode.h" #include "clang/AST/ASTDiagnostic.h" #include "clang/AST/Decl.h" +#include "clang/AST/DeclBase.h" #include "clang/AST/DeclarationName.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" @@ -23,15 +25,22 @@ #include "clang/AST/Type.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/OperatorKinds.h" +#include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" +#include +#include #include #include @@ -372,6 +381,23 @@ maybeDropCxxExplicitObjectParameters(ArrayRef Params) { return Params; } +template +std::string 
joinAndTruncate(const R &Range, size_t MaxLength) { + std::string Out; + llvm::raw_string_ostream OS(Out); + llvm::ListSeparator Sep(", "); + for (auto &&Element : Range) { + OS << Sep; + if (Out.size() + Element.size() >= MaxLength) { + OS << "..."; + break; + } + OS << Element; + } + OS.flush(); + return Out; +} + struct Callee { // Only one of Decl or Loc is set. // Loc is for calls through function pointers. @@ -422,7 +448,8 @@ public: Callee.Decl = E->getConstructor(); if (!Callee.Decl) return true; - processCall(Callee, {E->getArgs(), E->getNumArgs()}); + processCall(Callee, E->getParenOrBraceRange().getEnd(), + {E->getArgs(), E->getNumArgs()}); return true; } @@ -495,7 +522,7 @@ public: dyn_cast_or_null(Callee.Decl)) if (IsFunctor || Method->hasCXXExplicitFunctionObjectParameter()) Args = Args.drop_front(1); - processCall(Callee, Args); + processCall(Callee, E->getRParenLoc(), Args); return true; } @@ -709,10 +736,12 @@ public: private: using NameVec = SmallVector; - void processCall(Callee Callee, llvm::ArrayRef Args) { + void processCall(Callee Callee, SourceLocation RParenOrBraceLoc, + llvm::ArrayRef Args) { assert(Callee.Decl || Callee.Loc); - if (!Cfg.InlayHints.Parameters || Args.size() == 0) + if ((!Cfg.InlayHints.Parameters && !Cfg.InlayHints.DefaultArguments) || + Args.size() == 0) return; // The parameter name of a move or copy constructor is not very interesting. @@ -721,6 +750,9 @@ private: if (Ctor->isCopyOrMoveConstructor()) return; + SmallVector FormattedDefaultArgs; + bool HasNonDefaultArgs = false; + ArrayRef Params, ForwardedParams; // Resolve parameter packs to their forwarded parameter SmallVector ForwardedParamsStorage; @@ -752,15 +784,44 @@ private: } StringRef Name = ParameterNames[I]; - bool NameHint = shouldHintName(Args[I], Name); - bool ReferenceHint = shouldHintReference(Params[I], ForwardedParams[I]); - - if (NameHint || ReferenceHint) { + const bool NameHint = + shouldHintName(Args[I], Name) && Cfg.InlayHints.Parameters; + const bool ReferenceHint = + shouldHintReference(Params[I], ForwardedParams[I]) && + Cfg.InlayHints.Parameters; + + const bool IsDefault = isa(Args[I]); + HasNonDefaultArgs |= !IsDefault; + if (IsDefault) { + if (Cfg.InlayHints.DefaultArguments) { + const auto SourceText = Lexer::getSourceText( + CharSourceRange::getTokenRange(Params[I]->getDefaultArgRange()), + AST.getSourceManager(), AST.getLangOpts()); + const auto Abbrev = + (SourceText.size() > Cfg.InlayHints.TypeNameLimit || + SourceText.contains("\n")) + ? "..." + : SourceText; + if (NameHint) + FormattedDefaultArgs.emplace_back( + llvm::formatv("{0}: {1}", Name, Abbrev)); + else + FormattedDefaultArgs.emplace_back(llvm::formatv("{0}", Abbrev)); + } + } else if (NameHint || ReferenceHint) { addInlayHint(Args[I]->getSourceRange(), HintSide::Left, InlayHintKind::Parameter, ReferenceHint ? "&" : "", NameHint ? Name : "", ": "); } } + + if (!FormattedDefaultArgs.empty()) { + std::string Hint = + joinAndTruncate(FormattedDefaultArgs, Cfg.InlayHints.TypeNameLimit); + addInlayHint(SourceRange{RParenOrBraceLoc}, HintSide::Left, + InlayHintKind::DefaultArgument, + HasNonDefaultArgs ? 
", " : "", Hint, ""); + } } static bool isSetter(const FunctionDecl *Callee, const NameVec &ParamNames) { @@ -968,6 +1029,7 @@ private: CHECK_KIND(Type, DeducedTypes); CHECK_KIND(Designator, Designators); CHECK_KIND(BlockEnd, BlockEnd); + CHECK_KIND(DefaultArgument, DefaultArguments); #undef CHECK_KIND } diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp index c08f80442eaa..295ccd26a404 100644 --- a/clang-tools-extra/clangd/Protocol.cpp +++ b/clang-tools-extra/clangd/Protocol.cpp @@ -1477,6 +1477,7 @@ llvm::json::Value toJSON(const InlayHintKind &Kind) { return 2; case InlayHintKind::Designator: case InlayHintKind::BlockEnd: + case InlayHintKind::DefaultArgument: // This is an extension, don't serialize. return nullptr; } @@ -1517,6 +1518,8 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, InlayHintKind Kind) { return "designator"; case InlayHintKind::BlockEnd: return "block-end"; + case InlayHintKind::DefaultArgument: + return "default-argument"; } llvm_unreachable("Unknown clang.clangd.InlayHintKind"); }; diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h index a0f8b04bc4ff..5b2809575819 100644 --- a/clang-tools-extra/clangd/Protocol.h +++ b/clang-tools-extra/clangd/Protocol.h @@ -1681,6 +1681,15 @@ enum class InlayHintKind { /// This is a clangd extension. BlockEnd = 4, + /// An inlay hint that is for a default argument. + /// + /// An example of a parameter hint for a default argument: + /// void foo(bool A = true); + /// foo(^); + /// Adds an inlay hint "A: true". + /// This is a clangd extension. + DefaultArgument = 6, + /// Other ideas for hints that are not currently implemented: /// /// * Chaining hints, showing the types of intermediate expressions diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp index a5a349e93037..73dd273d6c39 100644 --- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp +++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp @@ -15,9 +15,12 @@ #include "support/Context.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include #include +#include #include namespace clang { @@ -81,6 +84,7 @@ Config noHintsConfig() { C.InlayHints.DeducedTypes = false; C.InlayHints.Designators = false; C.InlayHints.BlockEnd = false; + C.InlayHints.DefaultArguments = false; return C; } @@ -1465,6 +1469,75 @@ TEST(TypeHints, DefaultTemplateArgs) { ExpectedHint{": A", "binding"}); } +TEST(DefaultArguments, Smoke) { + Config Cfg; + Cfg.InlayHints.Parameters = + true; // To test interplay of parameters and default parameters + Cfg.InlayHints.DeducedTypes = false; + Cfg.InlayHints.Designators = false; + Cfg.InlayHints.BlockEnd = false; + + Cfg.InlayHints.DefaultArguments = true; + WithContextValue WithCfg(Config::Key, std::move(Cfg)); + + const auto *Code = R"cpp( + int foo(int A = 4) { return A; } + int bar(int A, int B = 1, bool C = foo($default1[[)]]) { return A; } + int A = bar($explicit[[2]]$default2[[)]]; + + void baz(int = 5) { if (false) baz($unnamed[[)]]; }; + )cpp"; + + assertHints(InlayHintKind::DefaultArgument, Code, + ExpectedHint{"A: 4", "default1", Left}, + ExpectedHint{", B: 1, C: foo()", "default2", Left}, + ExpectedHint{"5", "unnamed", Left}); + + assertHints(InlayHintKind::Parameter, Code, + ExpectedHint{"A: ", "explicit", Left}); +} + +TEST(DefaultArguments, WithoutParameterNames) { + Config 
Cfg;
+  Cfg.InlayHints.Parameters = false; // To test just default args this time
+  Cfg.InlayHints.DeducedTypes = false;
+  Cfg.InlayHints.Designators = false;
+  Cfg.InlayHints.BlockEnd = false;
+
+  Cfg.InlayHints.DefaultArguments = true;
+  WithContextValue WithCfg(Config::Key, std::move(Cfg));
+
+  const auto *Code = R"cpp(
+    struct Baz {
+      Baz(float a = 3 //
+                      + 2);
+    };
+    struct Foo {
+      Foo(int, Baz baz = //
+                  Baz{$abbreviated[[}]]
+
+          //
+          ) {}
+    };
+
+    int main() {
+      Foo foo1(1$paren[[)]];
+      Foo foo2{2$brace1[[}]];
+      Foo foo3 = {3$brace2[[}]];
+      auto foo4 = Foo{4$brace3[[}]];
+    }
+  )cpp";
+
+  assertHints(InlayHintKind::DefaultArgument, Code,
+              ExpectedHint{"...", "abbreviated", Left},
+              ExpectedHint{", Baz{}", "paren", Left},
+              ExpectedHint{", Baz{}", "brace1", Left},
+              ExpectedHint{", Baz{}", "brace2", Left},
+              ExpectedHint{", Baz{}", "brace3", Left});
+
+  assertHints(InlayHintKind::Parameter, Code);
+}
+
 TEST(TypeHints, Deduplication) {
   assertTypeHints(R"cpp(
     template <typename T>
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index e8148e06b6af..a9b1ab367f53 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -56,6 +56,8 @@ Improvements to clangd
 Inlay hints
 ^^^^^^^^^^^
 
+- Added `DefaultArguments` Inlay Hints option.
+
 Diagnostics
 ^^^^^^^^^^^
 
-- 
GitLab


From dde26e361f50df4b999ac117222c74f2c100f817 Mon Sep 17 00:00:00 2001
From: Xing Xue
Date: Sat, 19 Oct 2024 18:41:06 -0400
Subject: [PATCH 171/511] [libunwind][AIX] Call dlclose only when dlsym() fails (#112768)

The personality routine `__xlcxx_personality_v0` in `libc++abi` is
hard-coded in the unwinder as the handler for EH in applications
generated by the legacy IBM C++ compiler. The symbol is resolved
dynamically using `dlopen` to avoid a hard dependency of `libunwind` on
`libc++abi` for cases such as non-C++ applications. However, `dlclose`
was incorrectly called after `dlsym` succeeded, potentially invalidating
the function pointer obtained from `dlsym` when the memory allocated for
the `dlopen` is reclaimed. This PR changes libunwind to call `dlclose`
only when `dlsym` fails.

---
 libunwind/src/UnwindCursor.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp
index 2a3aba28fb6c..32e6fb43d988 100644
--- a/libunwind/src/UnwindCursor.hpp
+++ b/libunwind/src/UnwindCursor.hpp
@@ -2150,9 +2150,9 @@ bool UnwindCursor<A, R>::getInfoFromTBTable(pint_t pc, R &registers) {
         dlsym(libHandle, "__xlcxx_personality_v0"));
     if (xlcPersonalityV0 == NULL) {
       _LIBUNWIND_TRACE_UNWINDING("dlsym() failed with errno=%d\n", errno);
+      dlclose(libHandle);
       assert(0 && "dlsym() failed");
     }
-    dlclose(libHandle);
     errno = saveErrno;
   }
   xlcPersonalityV0InitLock.unlock();
-- 
GitLab


From cd938bf3279b6d2f1c0a8c82b6371a384d744378 Mon Sep 17 00:00:00 2001
From: Felipe de Azevedo Piovezan
Date: Sat, 19 Oct 2024 20:53:29 -0300
Subject: [PATCH 172/511] [lldb] Introduce Language::AreEquivalentFunctions
 (#112720)

This allows languages to provide an opinion on whether two symbol
contexts are equivalent (i.e. belong to the same function). It is useful
to drive the comparisons done by stepping plans that need to ensure
symbol contexts obtained from different points in time are actually the
same. 
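
For illustration, a language plugin could adopt the new hook along the
lines of the sketch below. This is a minimal sketch, not code from this
patch: the `MyLanguage` class is hypothetical, and comparing the
`function` members of the two contexts is just one plausible policy.

```cpp
// Hypothetical plugin (not part of this patch) overriding the new hook.
class MyLanguage : public Language {
public:
  std::optional<bool>
  AreEqualForFrameComparison(const SymbolContext &sc1,
                             const SymbolContext &sc2) const override {
    // Consider the contexts equal when both resolve to the same Function.
    if (sc1.function && sc2.function)
      return sc1.function == sc2.function;
    return {}; // No opinion: fall back to the generic comparison.
  }
  // ... remaining Language overrides elided for brevity ...
};
```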
---
 lldb/include/lldb/Target/Language.h            | 9 +++++++++
 lldb/source/Target/ThreadPlanStepOverRange.cpp | 5 +++++
 2 files changed, 14 insertions(+)

diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h
index 41d8eeef469e..c9cddee6baa2 100644
--- a/lldb/include/lldb/Target/Language.h
+++ b/lldb/include/lldb/Target/Language.h
@@ -363,6 +363,15 @@ public:
     return false;
   }
 
+  /// Returns a boolean indicating whether two symbol contexts are equal for the
+  /// purposes of frame comparison. If the plugin has no opinion, it should
+  /// return nullopt.
+  virtual std::optional<bool>
+  AreEqualForFrameComparison(const SymbolContext &sc1,
+                             const SymbolContext &sc2) const {
+    return {};
+  }
+
   /// Returns true if this Language supports exception breakpoints on throw via
   /// a corresponding LanguageRuntime plugin.
   virtual bool SupportsExceptionBreakpointsOnThrow() const { return false; }
diff --git a/lldb/source/Target/ThreadPlanStepOverRange.cpp b/lldb/source/Target/ThreadPlanStepOverRange.cpp
index 934f23b3b21a..ef5b4b5c434d 100644
--- a/lldb/source/Target/ThreadPlanStepOverRange.cpp
+++ b/lldb/source/Target/ThreadPlanStepOverRange.cpp
@@ -11,6 +11,7 @@
 #include "lldb/Symbol/CompileUnit.h"
 #include "lldb/Symbol/Function.h"
 #include "lldb/Symbol/LineTable.h"
+#include "lldb/Target/Language.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Target/RegisterContext.h"
 #include "lldb/Target/Target.h"
@@ -103,6 +104,10 @@ void ThreadPlanStepOverRange::SetupAvoidNoDebug(
 
 bool ThreadPlanStepOverRange::IsEquivalentContext(
     const SymbolContext &context) {
+  if (Language *language = Language::FindPlugin(context.GetLanguage()))
+    if (std::optional<bool> maybe_equivalent =
+            language->AreEqualForFrameComparison(context, m_addr_context))
+      return *maybe_equivalent;
   // Match as much as is specified in the m_addr_context: This is a fairly
   // loose sanity check. Note, sometimes the target doesn't get filled in so I
   // left out the target check. And sometimes the module comes in as the .o
-- 
GitLab


From 2deb3a26fa47a4640962489e5473726d7a8bf12b Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 19 Oct 2024 18:11:06 -0700
Subject: [PATCH 173/511] [LV] Fixup IV users only once during epilogue
 vectorization. (NFC)

Induction users only need to be updated when vectorizing the epilogue.
Avoid running fixupIVUsers when vectorizing the main loop during
epilogue vectorization.

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ce0903b838aa..a95ac032b1ff 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -539,10 +539,10 @@ protected:
   friend class LoopVectorizationPlanner;
 
   /// Set up the values of the IVs correctly when exiting the vector loop.
-  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
-                    Value *VectorTripCount, Value *EndValue,
-                    BasicBlock *MiddleBlock, VPlan &Plan,
-                    VPTransformState &State);
+  virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+                            Value *VectorTripCount, Value *EndValue,
+                            BasicBlock *MiddleBlock, VPlan &Plan,
+                            VPTransformState &State);
 
   /// Iteratively sink the scalarized operands of a predicated instruction into
   /// the block that was created for it. 
@@ -770,6 +770,11 @@ protected: BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; + + void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, + Value *VectorTripCount, Value *EndValue, + BasicBlock *MiddleBlock, VPlan &Plan, + VPTransformState &State) override {}; }; // A specialized derived class of inner loop vectorizer that performs -- GitLab From fe8af49a1bf73055941d7aba5d1d2f8e894e8022 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 20 Oct 2024 01:38:16 +0000 Subject: [PATCH 174/511] [ELF] Pass Ctx & to Defined & CommonSymbol --- lld/ELF/Arch/ARM.cpp | 6 +++--- lld/ELF/Arch/PPC64.cpp | 2 +- lld/ELF/Driver.cpp | 2 +- lld/ELF/InputFiles.cpp | 25 +++++++++++++------------ lld/ELF/LinkerScript.cpp | 6 +++--- lld/ELF/Relocations.cpp | 10 +++++----- lld/ELF/Symbols.h | 7 ++++--- lld/ELF/SyntheticSections.cpp | 8 ++++---- lld/ELF/Writer.cpp | 18 +++++++++--------- 9 files changed, 43 insertions(+), 41 deletions(-) diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 9bb3604ce61c..1cc396aa395d 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1439,8 +1439,8 @@ void ArmCmseSGSection::finalizeContents() { for (size_t i = 0; i < sgVeneers.size(); ++i) { ArmCmseSGVeneer *s = sgVeneers[i]; s->offset = i * s->size; - Defined(file, StringRef(), s->sym->binding, s->sym->stOther, s->sym->type, - s->offset | 1, s->size, this) + Defined(ctx, file, StringRef(), s->sym->binding, s->sym->stOther, + s->sym->type, s->offset | 1, s->size, this) .overwrite(*s->sym); } } @@ -1474,7 +1474,7 @@ template void elf::writeARMCmseImportLib(Ctx &ctx) { for (auto &p : ctx.symtab->cmseSymMap) { Defined *d = cast(p.second.sym); impSymTab->addSymbol(makeDefined( - ctx.internalFile, d->getName(), d->computeBinding(ctx), + ctx, ctx.internalFile, d->getName(), d->computeBinding(ctx), /*stOther=*/0, STT_FUNC, d->getVA(), d->getSize(), nullptr)); } diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index d937492fe440..9f550745f93b 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -254,7 +254,7 @@ static bool addOptional(Ctx &ctx, StringRef name, uint64_t value, Symbol *sym = ctx.symtab->find(name); if (!sym || sym->isDefined()) return false; - sym->resolve(ctx, Defined{ctx.internalFile, StringRef(), STB_GLOBAL, + sym->resolve(ctx, Defined{ctx, ctx.internalFile, StringRef(), STB_GLOBAL, STV_HIDDEN, STT_FUNC, value, /*size=*/0, /*section=*/nullptr}); defined.push_back(cast(sym)); diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index fb77e67e9fc5..0d7712f904da 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2375,7 +2375,7 @@ static void replaceCommonSymbols(Ctx &ctx) { auto *bss = make(ctx, "COMMON", s->size, s->alignment); bss->file = s->file; ctx.inputSections.push_back(bss); - Defined(s->file, StringRef(), s->binding, s->stOther, s->type, + Defined(ctx, s->file, StringRef(), s->binding, s->stOther, s->type, /*value=*/0, s->size, bss) .overwrite(*s); } diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 3d02ef8b77ab..0d3db3731388 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1156,14 +1156,14 @@ void ObjFile::initializeSymbols(const object::ELFFile &obj) { fatal(toString(this) + ": common symbol '" + sym->getName() + "' has invalid alignment: " + Twine(value)); hasCommonSyms = true; - sym->resolve(ctx, CommonSymbol{this, StringRef(), binding, stOther, type, - value, size}); + sym->resolve(ctx, 
CommonSymbol{ctx, this, StringRef(), binding, stOther, + type, value, size}); continue; } // Handle global defined symbols. Defined::section will be set in postParse. - sym->resolve(ctx, Defined{this, StringRef(), binding, stOther, type, value, - size, nullptr}); + sym->resolve(ctx, Defined{ctx, this, StringRef(), binding, stOther, type, + value, size, nullptr}); } // Undefined symbols (excluding those defined relative to non-prevailing @@ -1219,7 +1219,7 @@ void ObjFile::initSectionsAndLocalSyms(bool ignoreComdats) { new (symbols[i]) Undefined(this, name, STB_LOCAL, eSym.st_other, type, /*discardedSecIdx=*/secIdx); else - new (symbols[i]) Defined(this, name, STB_LOCAL, eSym.st_other, type, + new (symbols[i]) Defined(ctx, this, name, STB_LOCAL, eSym.st_other, type, eSym.st_value, eSym.st_size, sec); symbols[i]->partition = 1; symbols[i]->isUsedInRegularObj = true; @@ -1765,11 +1765,12 @@ static void createBitcodeSymbol(Ctx &ctx, Symbol *&sym, } if (objSym.isCommon()) { - sym->resolve(ctx, CommonSymbol{&f, StringRef(), binding, visibility, + sym->resolve(ctx, CommonSymbol{ctx, &f, StringRef(), binding, visibility, STT_OBJECT, objSym.getCommonAlignment(), objSym.getCommonSize()}); } else { - Defined newSym(&f, StringRef(), binding, visibility, type, 0, 0, nullptr); + Defined newSym(ctx, &f, StringRef(), binding, visibility, type, 0, 0, + nullptr); if (objSym.canBeOmittedFromSymbolTable()) newSym.exportDynamic = false; sym->resolve(ctx, newSym); @@ -1849,14 +1850,14 @@ void BinaryFile::parse() { llvm::StringSaver &saver = lld::saver(); - ctx.symtab->addAndCheckDuplicate(ctx, Defined{this, saver.save(s + "_start"), - STB_GLOBAL, STV_DEFAULT, - STT_OBJECT, 0, 0, section}); ctx.symtab->addAndCheckDuplicate( - ctx, Defined{this, saver.save(s + "_end"), STB_GLOBAL, STV_DEFAULT, + ctx, Defined{ctx, this, saver.save(s + "_start"), STB_GLOBAL, STV_DEFAULT, + STT_OBJECT, 0, 0, section}); + ctx.symtab->addAndCheckDuplicate( + ctx, Defined{ctx, this, saver.save(s + "_end"), STB_GLOBAL, STV_DEFAULT, STT_OBJECT, data.size(), 0, section}); ctx.symtab->addAndCheckDuplicate( - ctx, Defined{this, saver.save(s + "_size"), STB_GLOBAL, STV_DEFAULT, + ctx, Defined{ctx, this, saver.save(s + "_size"), STB_GLOBAL, STV_DEFAULT, STT_OBJECT, data.size(), 0, nullptr}); } diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 0560065ffa47..d2088b4c6481 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -227,8 +227,8 @@ void LinkerScript::addSymbol(SymbolAssignment *cmd) { // write expressions like this: `alignment = 16; . = ALIGN(., alignment)`. uint64_t symValue = value.sec ? 0 : value.getValue(); - Defined newSym(createInternalFile(ctx, cmd->location), cmd->name, STB_GLOBAL, - visibility, value.type, symValue, 0, sec); + Defined newSym(ctx, createInternalFile(ctx, cmd->location), cmd->name, + STB_GLOBAL, visibility, value.type, symValue, 0, sec); Symbol *sym = ctx.symtab->insert(cmd->name); sym->mergeProperties(newSym); @@ -244,7 +244,7 @@ void LinkerScript::declareSymbol(SymbolAssignment *cmd) { return; uint8_t visibility = cmd->hidden ? 
STV_HIDDEN : STV_DEFAULT; - Defined newSym(ctx.internalFile, cmd->name, STB_GLOBAL, visibility, + Defined newSym(ctx, ctx.internalFile, cmd->name, STB_GLOBAL, visibility, STT_NOTYPE, 0, 0, nullptr); // If the symbol is already defined, its order is 0 (with absence indicating diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 0188d658f921..c8dcc276c30a 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -315,10 +315,10 @@ static SmallSet getSymbolsAt(Ctx &ctx, SharedSymbol &ss) { // in .bss and in the case of a canonical plt entry it is in .plt. This function // replaces the existing symbol with a Defined pointing to the appropriate // location. -static void replaceWithDefined(Symbol &sym, SectionBase &sec, uint64_t value, - uint64_t size) { +static void replaceWithDefined(Ctx &ctx, Symbol &sym, SectionBase &sec, + uint64_t value, uint64_t size) { Symbol old = sym; - Defined(sym.file, StringRef(), sym.binding, sym.stOther, sym.type, value, + Defined(ctx, sym.file, StringRef(), sym.binding, sym.stOther, sym.type, value, size, &sec) .overwrite(sym); @@ -398,7 +398,7 @@ template static void addCopyRelSymbol(Ctx &ctx, SharedSymbol &ss) { // dynamic symbol for each one. This causes the copy relocation to correctly // interpose any aliases. for (SharedSymbol *sym : getSymbolsAt(ctx, ss)) - replaceWithDefined(*sym, *sec, 0, sym->size); + replaceWithDefined(ctx, *sym, *sec, 0, sym->size); ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->copyRel, *sec, 0, ss); } @@ -1807,7 +1807,7 @@ void elf::postScanRelocations(Ctx &ctx) { } else { assert(sym.isFunc() && sym.hasFlag(NEEDS_PLT)); if (!sym.isDefined()) { - replaceWithDefined(sym, *ctx.in.plt, + replaceWithDefined(ctx, sym, *ctx.in.plt, ctx.target->pltHeaderSize + ctx.target->pltEntrySize * sym.getPltIdx(ctx), 0); diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h index 010ae9742378..86abebe79f8d 100644 --- a/lld/ELF/Symbols.h +++ b/lld/ELF/Symbols.h @@ -363,8 +363,9 @@ public: // Represents a symbol that is defined in the current output file. class Defined : public Symbol { public: - Defined(InputFile *file, StringRef name, uint8_t binding, uint8_t stOther, - uint8_t type, uint64_t value, uint64_t size, SectionBase *section) + Defined(Ctx &ctx, InputFile *file, StringRef name, uint8_t binding, + uint8_t stOther, uint8_t type, uint64_t value, uint64_t size, + SectionBase *section) : Symbol(DefinedKind, file, name, binding, stOther, type), value(value), size(size), section(section) { exportDynamic = ctx.arg.exportDynamic; @@ -401,7 +402,7 @@ public: // section. (Therefore, the later passes don't see any CommonSymbols.) 
class CommonSymbol : public Symbol { public: - CommonSymbol(InputFile *file, StringRef name, uint8_t binding, + CommonSymbol(Ctx &ctx, InputFile *file, StringRef name, uint8_t binding, uint8_t stOther, uint8_t type, uint64_t alignment, uint64_t size) : Symbol(CommonKind, file, name, binding, stOther, type), alignment(alignment), size(size) { diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index e18e7a32df86..f50404ed3016 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -276,8 +276,8 @@ InputSection *elf::createInterpSection(Ctx &ctx) { Defined *elf::addSyntheticLocal(Ctx &ctx, StringRef name, uint8_t type, uint64_t value, uint64_t size, InputSectionBase §ion) { - Defined *s = makeDefined(section.file, name, STB_LOCAL, STV_DEFAULT, type, - value, size, §ion); + Defined *s = makeDefined(ctx, section.file, name, STB_LOCAL, STV_DEFAULT, + type, value, size, §ion); if (ctx.in.symTab) ctx.in.symTab->addSymbol(s); @@ -4681,8 +4681,8 @@ static Defined *addOptionalRegular(Ctx &ctx, StringRef name, SectionBase *sec, if (!s || s->isDefined() || s->isCommon()) return nullptr; - s->resolve(ctx, Defined{ctx.internalFile, StringRef(), STB_GLOBAL, stOther, - STT_NOTYPE, val, + s->resolve(ctx, Defined{ctx, ctx.internalFile, StringRef(), STB_GLOBAL, + stOther, STT_NOTYPE, val, /*size=*/0, sec}); s->isUsedInRegularObj = true; return cast(s); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 2cd4478d00cf..c237a5f3793a 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -145,8 +145,8 @@ static Defined *addOptionalRegular(Ctx &ctx, StringRef name, SectionBase *sec, if (!s || s->isDefined() || s->isCommon()) return nullptr; - s->resolve(ctx, Defined{ctx.internalFile, StringRef(), STB_GLOBAL, stOther, - STT_NOTYPE, val, + s->resolve(ctx, Defined{ctx, ctx.internalFile, StringRef(), STB_GLOBAL, + stOther, STT_NOTYPE, val, /*size=*/0, sec}); s->isUsedInRegularObj = true; return cast(s); @@ -158,7 +158,7 @@ void elf::addReservedSymbols(Ctx &ctx) { if (ctx.arg.emachine == EM_MIPS) { auto addAbsolute = [&](StringRef name) { Symbol *sym = - ctx.symtab->addSymbol(Defined{ctx.internalFile, name, STB_GLOBAL, + ctx.symtab->addSymbol(Defined{ctx, ctx.internalFile, name, STB_GLOBAL, STV_HIDDEN, STT_NOTYPE, 0, 0, nullptr}); sym->isUsedInRegularObj = true; return cast(sym); @@ -211,9 +211,9 @@ void elf::addReservedSymbols(Ctx &ctx) { if (ctx.arg.emachine == EM_PPC64) gotOff = 0x8000; - s->resolve(ctx, - Defined{ctx.internalFile, StringRef(), STB_GLOBAL, STV_HIDDEN, - STT_NOTYPE, gotOff, /*size=*/0, ctx.out.elfHeader}); + s->resolve(ctx, Defined{ctx, ctx.internalFile, StringRef(), STB_GLOBAL, + STV_HIDDEN, STT_NOTYPE, gotOff, /*size=*/0, + ctx.out.elfHeader}); ctx.sym.globalOffsetTable = cast(s); } @@ -534,7 +534,7 @@ template void Writer::addSectionSymbols() { // Set the symbol to be relative to the output section so that its st_value // equals the output section address. Note, there may be a gap between the // start of the output section and isec. 
- ctx.in.symTab->addSymbol(makeDefined(isec->file, "", STB_LOCAL, + ctx.in.symTab->addSymbol(makeDefined(ctx, isec->file, "", STB_LOCAL, /*stOther=*/0, STT_SECTION, /*value=*/0, /*size=*/0, &osec)); } @@ -1734,7 +1734,7 @@ template void Writer::finalizeSections() { // https://sourceware.org/ml/binutils/2002-03/msg00360.html if (ctx.mainPart->dynamic->parent) { Symbol *s = ctx.symtab->addSymbol(Defined{ - ctx.internalFile, "_DYNAMIC", STB_WEAK, STV_HIDDEN, STT_NOTYPE, + ctx, ctx.internalFile, "_DYNAMIC", STB_WEAK, STV_HIDDEN, STT_NOTYPE, /*value=*/0, /*size=*/0, ctx.mainPart->dynamic.get()}); s->isUsedInRegularObj = true; } @@ -1775,7 +1775,7 @@ template void Writer::finalizeSections() { // define _TLS_MODULE_BASE_ relative to the first TLS section. Symbol *s = ctx.symtab->find("_TLS_MODULE_BASE_"); if (s && s->isUndefined()) { - s->resolve(ctx, Defined{ctx.internalFile, StringRef(), STB_GLOBAL, + s->resolve(ctx, Defined{ctx, ctx.internalFile, StringRef(), STB_GLOBAL, STV_HIDDEN, STT_TLS, /*value=*/0, 0, /*section=*/nullptr}); ctx.sym.tlsModuleBase = cast(s); -- GitLab From cba5c77a715cfa5892c69b6c646556825932575b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 19 Oct 2024 19:14:21 -0700 Subject: [PATCH 175/511] [VPlan] Mark unreachable code path when retrieving the scalar PH. (NFCI) --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a95ac032b1ff..0d35bfb921dc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8866,11 +8866,8 @@ static void addLiveOutsForFirstOrderRecurrences( ScalarPHVPBB = cast(MiddleVPBB->getSuccessors()[1]); } else if (ExitUsersToFix.empty()) { ScalarPHVPBB = cast(MiddleVPBB->getSingleSuccessor()); - } - if (!ScalarPHVPBB) { - assert(ExitUsersToFix.empty() && - "missed inserting extracts for exiting values"); - return; + } else { + llvm_unreachable("unsupported CFG in VPlan"); } VPBuilder ScalarPHBuilder(ScalarPHVPBB); -- GitLab From 1336e3d0b9a361fbbe2d97f225ef6757d20df51a Mon Sep 17 00:00:00 2001 From: c8ef Date: Sun, 20 Oct 2024 10:46:35 +0800 Subject: [PATCH 176/511] [ConstantFold] Fold `ilogb` and `ilogbf` when the input parameter is a constant value. (#113014) This patch adds support for constant folding for the `ilogb` and `ilogbf` libc functions. 
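
As a quick illustration of the intended effect (a sketch, not a test
from this patch): `ilogb` returns the unbiased binary exponent of its
argument, so a call with a constant argument can now be folded away.

```cpp
// ilogb(8.0) == 3 because 8.0 == 1.0 * 2^3, and ilogb(0.5) == -1.
// Folding is skipped for zero (and the tests below also cover
// infinities, NaNs, and poison, which are left unfolded).
#include <cmath>
int folded_exponent() { return std::ilogb(8.0); } // expected to fold to 3
```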
--- llvm/lib/Analysis/ConstantFolding.cpp | 10 +- llvm/test/Transforms/InstCombine/ilogb.ll | 203 ++++++++++++++++++++++ 2 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/InstCombine/ilogb.ll diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 74df67a4ff9b..c0104d2bc261 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1677,6 +1677,8 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { return Name == "fabs" || Name == "fabsf" || Name == "floor" || Name == "floorf" || Name == "fmod" || Name == "fmodf"; + case 'i': + return Name == "ilogb" || Name == "ilogbf"; case 'l': return Name == "log" || Name == "logf" || Name == "logl" || Name == "log2" || Name == "log2f" || Name == "log10" || @@ -2131,7 +2133,8 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, } #endif - if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) + if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy() && + !Ty->isIntegerTy()) return nullptr; // Use internal versions of these intrinsics. @@ -2391,6 +2394,11 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, // TODO: What about hosts that lack a C99 library? return ConstantFoldFP(log10, APF, Ty); break; + case LibFunc_ilogb: + case LibFunc_ilogbf: + if (!APF.isZero() && TLI->has(Func)) + return ConstantInt::get(Ty, ilogb(APF), true); + break; case LibFunc_logb: case LibFunc_logbf: if (!APF.isZero() && TLI->has(Func)) diff --git a/llvm/test/Transforms/InstCombine/ilogb.ll b/llvm/test/Transforms/InstCombine/ilogb.ll new file mode 100644 index 000000000000..e30791fe68e7 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ilogb.ll @@ -0,0 +1,203 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i32 @ilogbf_const1() { +; CHECK-LABEL: define i32 @ilogbf_const1() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float 7.000000e+00) +; CHECK-NEXT: ret i32 2 +; + %r = call i32 @ilogbf(float 7.000000e+00) + ret i32 %r +} + +define i32 @ilogb_const1() { +; CHECK-LABEL: define i32 @ilogb_const1() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double -7.000000e+00) +; CHECK-NEXT: ret i32 2 +; + %r = call i32 @ilogb(double -7.000000e+00) + ret i32 %r +} + +define i32 @ilogbf_const2() { +; CHECK-LABEL: define i32 @ilogbf_const2() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float 5.000000e-01) +; CHECK-NEXT: ret i32 -1 +; + %r = call i32 @ilogbf(float 5.000000e-01) + ret i32 %r +} + +define i32 @ilogb_const2() { +; CHECK-LABEL: define i32 @ilogb_const2() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double -5.000000e-01) +; CHECK-NEXT: ret i32 -1 +; + %r = call i32 @ilogb(double -5.000000e-01) + ret i32 %r +} + +define i32 @ilogbf_zero() { +; CHECK-LABEL: define i32 @ilogbf_zero() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float 0.000000e+00) +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogbf(float 0.000000e+00) + ret i32 %r +} + +define i32 @ilogb_zero() { +; CHECK-LABEL: define i32 @ilogb_zero() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double 0.000000e+00) +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogb(double 0.000000e+00) + ret i32 %r +} + +define i32 @ilogbf_neg_zero() { +; CHECK-LABEL: define i32 @ilogbf_neg_zero() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float -0.000000e+00) +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogbf(float -0.000000e+00) + ret i32 
%r +} + +define i32 @ilogb_neg_zero() { +; CHECK-LABEL: define i32 @ilogb_neg_zero() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double -0.000000e+00) +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogb(double -0.000000e+00) + ret i32 %r +} + +define i32 @ilogbf_inf() { +; CHECK-LABEL: define i32 @ilogbf_inf() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float 0x7FF0000000000000) +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogbf(float 0x7FF0000000000000) + ret i32 %r +} + +define i32 @ilogb_inf() { +; CHECK-LABEL: define i32 @ilogb_inf() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double 0x7FF0000000000000) +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogb(double 0x7FF0000000000000) + ret i32 %r +} + +define i32 @ilogbf_nan() { +; CHECK-LABEL: define i32 @ilogbf_nan() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float 0x7FF8000000000000) +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogbf(float 0x7FF8000000000000) + ret i32 %r +} + +define i32 @ilogb_nan() { +; CHECK-LABEL: define i32 @ilogb_nan() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double 0x7FF8000000000000) +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogb(double 0x7FF8000000000000) + ret i32 %r +} + +define i32 @ilogbf_zero_readnone() { +; CHECK-LABEL: define i32 @ilogbf_zero_readnone() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float 0.000000e+00) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogbf(float 0.000000e+00) readnone + ret i32 %r +} + +define i32 @ilogb_zero_readnone() { +; CHECK-LABEL: define i32 @ilogb_zero_readnone() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double 0.000000e+00) #[[ATTR0]] +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogb(double 0.000000e+00) readnone + ret i32 %r +} + +define i32 @ilogbf_neg_zero_readnone() { +; CHECK-LABEL: define i32 @ilogbf_neg_zero_readnone() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float -0.000000e+00) #[[ATTR0]] +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogbf(float -0.000000e+00) readnone + ret i32 %r +} + +define i32 @ilogb_neg_zero_readnone() { +; CHECK-LABEL: define i32 @ilogb_neg_zero_readnone() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double -0.000000e+00) #[[ATTR0]] +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogb(double -0.000000e+00) readnone + ret i32 %r +} + +define i32 @ilogbf_inf_readnone() { +; CHECK-LABEL: define i32 @ilogbf_inf_readnone() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float 0x7FF0000000000000) #[[ATTR0]] +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogbf(float 0x7FF0000000000000) readnone + ret i32 %r +} + +define i32 @ilogb_inf_readnone() { +; CHECK-LABEL: define i32 @ilogb_inf_readnone() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double 0x7FF0000000000000) #[[ATTR0]] +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogb(double 0x7FF0000000000000) readnone + ret i32 %r +} + +define i32 @ilogbf_nan_readnone() { +; CHECK-LABEL: define i32 @ilogbf_nan_readnone() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float 0x7FF8000000000000) #[[ATTR0]] +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogbf(float 0x7FF8000000000000) readnone + ret i32 %r +} + +define i32 @ilogb_nan_readnone() { +; CHECK-LABEL: define i32 @ilogb_nan_readnone() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double 0x7FF8000000000000) #[[ATTR0]] +; CHECK-NEXT: ret i32 [[R]] +; + %r = call i32 @ilogb(double 0x7FF8000000000000) readnone + ret i32 %r +} + +define i32 @ilogbf_poison() { +; CHECK-LABEL: define i32 @ilogbf_poison() { +; CHECK-NEXT: [[R:%.*]] = call i32 @ilogbf(float poison) 
+; CHECK-NEXT: ret i32 [[R]]
+;
+  %r = call i32 @ilogbf(float poison)
+  ret i32 %r
+}
+
+define i32 @ilogb_poison() {
+; CHECK-LABEL: define i32 @ilogb_poison() {
+; CHECK-NEXT: [[R:%.*]] = call i32 @ilogb(double poison)
+; CHECK-NEXT: ret i32 [[R]]
+;
+  %r = call i32 @ilogb(double poison)
+  ret i32 %r
+}
+
+declare i32 @ilogbf(float)
+declare i32 @ilogb(double)
-- 
GitLab


From 4a011ac84fa16f7eed34c309bdac5591d9553da7 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi
Date: Sun, 20 Oct 2024 12:30:35 +0900
Subject: [PATCH 177/511] [Coverage] Introduce "partial fold" on BranchRegion
 (#112694)

Previously, both the True and False counts were folded. This lost the
information of whether the branch was True or False before folding, and
prevented recalling branch counts when merging template instantiations.

In `llvm-cov`, a folded branch is shown as:

- `[True: n, Folded]`
- `[Folded, False: n]`

If `n` is zero, the branch is reported as "uncovered", which is
distinguished from a "folded" branch. When folded branches are merged,
`Folded` may be dissolved.

In the coverage map, either `Counter` is `Zero`. (Previously, both were
`Zero`.)

Since "partial fold" has been introduced, one side of each branch in a
`switch` is omitted as `Folded`. Each `case:` in a `switch` is reported
as `[True: n, Folded]`, since the `False` count carries no meaningful
value. When a `switch` doesn't have a `default:`, `switch (Cond)` is
reported as `[Folded, False: n]`, since the `True` count would just be
the sum of the `case` counts. A `switch` with a `default:` can be
considered a statement that doesn't have any `False`(s).

---
 clang/lib/CodeGen/CoverageMappingGen.cpp      | 46 +++++--------
 .../CoverageMapping/branch-constfolded.cpp    | 40 ++++++------
 clang/test/CoverageMapping/if.cpp             |  4 +-
 clang/test/CoverageMapping/macro-expansion.c  | 10 +--
 .../test/CoverageMapping/mcdc-scratch-space.c |  4 +-
 .../CoverageMapping/mcdc-system-headers.cpp   |  4 +-
 clang/test/CoverageMapping/switch.cpp         | 64 +++++++++----------
 clang/test/CoverageMapping/switchmacro.c      |  4 +-
 .../ProfileData/Coverage/CoverageMapping.h    | 15 +++--
 .../ProfileData/Coverage/CoverageMapping.cpp  |  2 +-
 .../test/tools/llvm-cov/branch-c-general.test | 14 ++--
 llvm/tools/llvm-cov/CoverageExporterJson.cpp  |  2 +-
 llvm/tools/llvm-cov/CoverageExporterLcov.cpp  |  2 +-
 llvm/tools/llvm-cov/CoverageSummaryInfo.cpp   | 24 +++----
 .../tools/llvm-cov/SourceCoverageViewHTML.cpp | 51 +++++++++------
 .../tools/llvm-cov/SourceCoverageViewText.cpp | 47 ++++++++------
 16 files changed, 169 insertions(+), 164 deletions(-)

diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index 577a0f571e16..0a63c50d44f4 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -1098,12 +1098,6 @@ struct CounterCoverageMappingBuilder
     return ExitCount;
   }
 
-  /// Determine whether the given condition can be constant folded.
-  bool ConditionFoldsToBool(const Expr *Cond) {
-    Expr::EvalResult Result;
-    return (Cond->EvaluateAsInt(Result, CVM.getCodeGenModule().getContext()));
-  }
-
   /// Create a Branch Region around an instrumentable condition for coverage
   /// and add it to the function's SourceRegions. A branch region tracks a
   /// "True" counter and a "False" counter for boolean expressions that
@@ -1133,13 +1127,15 @@ struct CounterCoverageMappingBuilder
     // Alternatively, we can prevent any optimization done via
     // constant-folding by ensuring that ConstantFoldsToSimpleInteger() in
    // CodeGenFunction.c always returns false, but that is very heavy-handed. 
- if (ConditionFoldsToBool(C)) - popRegions(pushRegion(Counter::getZero(), getStart(C), getEnd(C), - Counter::getZero(), BranchParams)); - else - // Otherwise, create a region with the True counter and False counter. - popRegions(pushRegion(TrueCnt, getStart(C), getEnd(C), FalseCnt, - BranchParams)); + Expr::EvalResult Result; + if (C->EvaluateAsInt(Result, CVM.getCodeGenModule().getContext())) { + if (Result.Val.getInt().getBoolValue()) + FalseCnt = Counter::getZero(); + else + TrueCnt = Counter::getZero(); + } + popRegions( + pushRegion(TrueCnt, getStart(C), getEnd(C), FalseCnt, BranchParams)); } } @@ -1153,12 +1149,12 @@ struct CounterCoverageMappingBuilder /// Create a Branch Region around a SwitchCase for code coverage /// and add it to the function's SourceRegions. - void createSwitchCaseRegion(const SwitchCase *SC, Counter TrueCnt, - Counter FalseCnt) { + void createSwitchCaseRegion(const SwitchCase *SC, Counter TrueCnt) { // Push region onto RegionStack but immediately pop it (which adds it to // the function's SourceRegions) because it doesn't apply to any other // source other than the SwitchCase. - popRegions(pushRegion(TrueCnt, getStart(SC), SC->getColonLoc(), FalseCnt)); + popRegions(pushRegion(TrueCnt, getStart(SC), SC->getColonLoc(), + Counter::getZero())); } /// Check whether a region with bounds \c StartLoc and \c EndLoc @@ -1870,24 +1866,16 @@ struct CounterCoverageMappingBuilder const SwitchCase *Case = S->getSwitchCaseList(); for (; Case; Case = Case->getNextSwitchCase()) { HasDefaultCase = HasDefaultCase || isa(Case); - CaseCountSum = - addCounters(CaseCountSum, getRegionCounter(Case), /*Simplify=*/false); - createSwitchCaseRegion( - Case, getRegionCounter(Case), - subtractCounters(ParentCount, getRegionCounter(Case))); + auto CaseCount = getRegionCounter(Case); + CaseCountSum = addCounters(CaseCountSum, CaseCount, /*Simplify=*/false); + createSwitchCaseRegion(Case, CaseCount); } - // Simplify is skipped while building the counters above: it can get really - // slow on top of switches with thousands of cases. Instead, trigger - // simplification by adding zero to the last counter. - CaseCountSum = addCounters(CaseCountSum, Counter::getZero()); - // If no explicit default case exists, create a branch region to represent // the hidden branch, which will be added later by the CodeGen. This region // will be associated with the switch statement's condition. 
if (!HasDefaultCase) { - Counter DefaultTrue = subtractCounters(ParentCount, CaseCountSum); - Counter DefaultFalse = subtractCounters(ParentCount, DefaultTrue); - createBranchRegion(S->getCond(), DefaultTrue, DefaultFalse); + Counter DefaultCount = subtractCounters(ParentCount, CaseCountSum); + createBranchRegion(S->getCond(), Counter::getZero(), DefaultCount); } } diff --git a/clang/test/CoverageMapping/branch-constfolded.cpp b/clang/test/CoverageMapping/branch-constfolded.cpp index 1e7e32808e83..a2ac1c1eacd2 100644 --- a/clang/test/CoverageMapping/branch-constfolded.cpp +++ b/clang/test/CoverageMapping/branch-constfolded.cpp @@ -5,94 +5,94 @@ // CHECK-LABEL: _Z6fand_0b: bool fand_0(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:20 = M:3, C:2 - return false && a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, 0 + return false && a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, (#0 - #1) } // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:20 = #2, (#1 - #2) // CHECK-LABEL: _Z6fand_1b: bool fand_1(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:19 = M:3, C:2 return a && true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = #1, (#0 - #1) -} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = 0, 0 +} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = #2, 0 // CHECK-LABEL: _Z6fand_2bb: bool fand_2(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:25 = M:4, C:3 - return false && a && b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, 0 + return false && a && b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, (#0 - #3) } // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:20 = #4, (#3 - #4) // CHECK: Branch,File 0, [[@LINE-2]]:24 -> [[@LINE-2]]:25 = #2, (#1 - #2) // CHECK-LABEL: _Z6fand_3bb: bool fand_3(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:24 = M:4, C:3 return a && true && b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = #3, (#0 - #3) -} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = 0, 0 +} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = #4, 0 // CHECK: Branch,File 0, [[@LINE-2]]:23 -> [[@LINE-2]]:24 = #2, (#1 - #2) // CHECK-LABEL: _Z6fand_4bb: bool fand_4(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:25 = M:4, C:3 return a && b && false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = #3, (#0 - #3) } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:16 = #4, (#3 - #4) - // CHECK: Branch,File 0, [[@LINE-2]]:20 -> [[@LINE-2]]:25 = 0, 0 + // CHECK: Branch,File 0, [[@LINE-2]]:20 -> [[@LINE-2]]:25 = 0, (#1 - #2) // CHECK-LABEL: _Z6fand_5b: bool fand_5(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:23 = M:3, C:2 - return false && true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, 0 -} // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:23 = 0, 0 + return false && true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, (#0 - #1) +} // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:23 = #2, 0 // CHECK-LABEL: _Z6fand_6b: bool fand_6(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:19 = M:3, C:2 - return true && a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = 0, 0 + return true && a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = #1, 0 } // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:19 = #2, (#1 - #2) // CHECK-LABEL: _Z6fand_7b: bool fand_7(bool a) { // MCDC: 
Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:20 = M:3, C:2 return a && false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = #1, (#0 - #1) -} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, 0 +} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, (#1 - #2) // CHECK-LABEL: _Z5for_0b: bool for_0(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:19 = M:3, C:2 - return true || a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = 0, 0 + return true || a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = (#0 - #1), 0 } // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:19 = (#1 - #2), #2 // CHECK-LABEL: _Z5for_1b: bool for_1(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:20 = M:3, C:2 return a || false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = (#0 - #1), #1 -} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, 0 +} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, #2 // CHECK-LABEL: _Z5for_2bb: bool for_2(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:24 = M:4, C:3 - return true || a || b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = 0, 0 + return true || a || b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = (#0 - #3), 0 } // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:19 = (#3 - #4), #4 // CHECK: Branch,File 0, [[@LINE-2]]:23 -> [[@LINE-2]]:24 = (#1 - #2), #2 // CHECK-LABEL: _Z5for_3bb: bool for_3(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:25 = M:4, C:3 return a || false || b; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = (#0 - #3), #3 -} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, 0 +} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:20 = 0, #4 // CHECK: Branch,File 0, [[@LINE-2]]:24 -> [[@LINE-2]]:25 = (#1 - #2), #2 // CHECK-LABEL: _Z5for_4bb: bool for_4(bool a, bool b) {// MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:24 = M:4, C:3 return a || b || true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = (#0 - #3), #3 } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:16 = (#3 - #4), #4 - // CHECK: Branch,File 0, [[@LINE-2]]:20 -> [[@LINE-2]]:24 = 0, 0 + // CHECK: Branch,File 0, [[@LINE-2]]:20 -> [[@LINE-2]]:24 = (#1 - #2), 0 // CHECK-LABEL: _Z5for_5b: bool for_5(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:23 = M:3, C:2 - return true || false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = 0, 0 -} // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:23 = 0, 0 + return true || false; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:14 = (#0 - #1), 0 +} // CHECK: Branch,File 0, [[@LINE-1]]:18 -> [[@LINE-1]]:23 = 0, #2 // CHECK-LABEL: _Z5for_6b: bool for_6(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:20 = M:3, C:2 - return false || a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, 0 + return false || a; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:15 = 0, #1 } // CHECK: Branch,File 0, [[@LINE-1]]:19 -> [[@LINE-1]]:20 = (#1 - #2), #2 // CHECK-LABEL: _Z5for_7b: bool for_7(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:19 = M:3, C:2 return a || true; // CHECK: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = (#0 - #1), #1 -} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = 0, 0 +} // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = (#1 - #2), 0 // CHECK-LABEL: _Z5for_8b: bool for_8(bool a) { // MCDC: Decision,File 0, 
[[@LINE+3]]:7 -> [[@LINE+3]]:20 = M:3, C:2 - // CHECK: Branch,File 0, [[@LINE+2]]:7 -> [[@LINE+2]]:11 = 0, 0 - // CHECK: Branch,File 0, [[@LINE+1]]:15 -> [[@LINE+1]]:20 = 0, 0 + // CHECK: Branch,File 0, [[@LINE+2]]:7 -> [[@LINE+2]]:11 = #2, 0 + // CHECK: Branch,File 0, [[@LINE+1]]:15 -> [[@LINE+1]]:20 = 0, (#2 - #3) if (true && false) return true; else diff --git a/clang/test/CoverageMapping/if.cpp b/clang/test/CoverageMapping/if.cpp index 445cdfc20e2a..b6fd525e930f 100644 --- a/clang/test/CoverageMapping/if.cpp +++ b/clang/test/CoverageMapping/if.cpp @@ -14,7 +14,7 @@ struct S { // CHECK-LABEL: _Z3foov: // CHECK-NEXT: [[@LINE+3]]:12 -> [[@LINE+8]]:2 = #0 // CHECK-NEXT: [[@LINE+3]]:15 -> [[@LINE+3]]:19 = #0 - // CHECK-NEXT: Branch,File 0, [[@LINE+2]]:15 -> [[@LINE+2]]:19 = 0, 0 + // CHECK-NEXT: Branch,File 0, [[@LINE+2]]:15 -> [[@LINE+2]]:19 = #2, 0 void foo() { // CHECK-NEXT: Gap,File 0, [[@LINE+1]]:21 -> [[@LINE+1]]:22 = #2 if (int j = true ? nop() // CHECK-NEXT: [[@LINE]]:22 -> [[@LINE]]:27 = #2 : nop(); // CHECK-NEXT: [[@LINE]]:22 -> [[@LINE]]:27 = (#0 - #2) @@ -168,7 +168,7 @@ int main() { // CHECK: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = // GH-45481 S s; s.the_prop = 0? 1 : 2; // CHECK-NEXT: File 0, [[@LINE]]:16 -> [[@LINE]]:17 = #0 - // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:16 -> [[@LINE-1]]:17 = 0, 0 + // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:16 -> [[@LINE-1]]:17 = 0, (#0 - #7) // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:18 -> [[@LINE-2]]:19 = #7 // CHECK-NEXT: File 0, [[@LINE-3]]:19 -> [[@LINE-3]]:20 = #7 // CHECK-NEXT: File 0, [[@LINE-4]]:23 -> [[@LINE-4]]:24 = (#0 - #7) diff --git a/clang/test/CoverageMapping/macro-expansion.c b/clang/test/CoverageMapping/macro-expansion.c index ad71fb15eda4..4cd2c9343719 100644 --- a/clang/test/CoverageMapping/macro-expansion.c +++ b/clang/test/CoverageMapping/macro-expansion.c @@ -4,29 +4,29 @@ // CHECK: File 1, [[@LINE+7]]:12 -> [[@LINE+7]]:38 = #0 // CHECK-NEXT: File 1, [[@LINE+6]]:15 -> [[@LINE+6]]:28 = (#0 + #2) // CHECK-NEXT: File 1, [[@LINE+5]]:21 -> [[@LINE+5]]:22 = (#0 + #2) -// CHECK: Branch,File 1, [[@LINE+4]]:21 -> [[@LINE+4]]:22 = 0, 0 +// CHECK: Branch,File 1, [[@LINE+4]]:21 -> [[@LINE+4]]:22 = 0, ((#0 + #2) - #3) // CHECK-NEXT: File 1, [[@LINE+3]]:24 -> [[@LINE+3]]:26 = #3 // CHECK-NEXT: File 1, [[@LINE+2]]:36 -> [[@LINE+2]]:37 = (#0 + #2) -// CHECK-NEXT: Branch,File 1, [[@LINE+1]]:36 -> [[@LINE+1]]:37 = 0, 0 +// CHECK-NEXT: Branch,File 1, [[@LINE+1]]:36 -> [[@LINE+1]]:37 = 0, #0 #define M1 do { if (0) {} } while (0) // CHECK-NEXT: File 2, [[@LINE+12]]:15 -> [[@LINE+12]]:41 = #0 // CHECK-NEXT: File 2, [[@LINE+11]]:18 -> [[@LINE+11]]:31 = (#0 + #4) // CHECK-NEXT: File 2, [[@LINE+10]]:24 -> [[@LINE+10]]:25 = (#0 + #4) // CHECK: File 2, [[@LINE+9]]:27 -> [[@LINE+9]]:29 = #5 // CHECK-NEXT: File 2, [[@LINE+8]]:39 -> [[@LINE+8]]:40 = (#0 + #4) -// CHECK-NEXT: Branch,File 2, [[@LINE+7]]:39 -> [[@LINE+7]]:40 = 0, 0 +// CHECK-NEXT: Branch,File 2, [[@LINE+7]]:39 -> [[@LINE+7]]:40 = 0, #0 // CHECK-NEXT: File 3, [[@LINE+6]]:15 -> [[@LINE+6]]:41 = #0 // CHECK-NEXT: File 3, [[@LINE+5]]:18 -> [[@LINE+5]]:31 = (#0 + #6) // CHECK-NEXT: File 3, [[@LINE+4]]:24 -> [[@LINE+4]]:25 = (#0 + #6) // CHECK: File 3, [[@LINE+3]]:27 -> [[@LINE+3]]:29 = #7 // CHECK-NEXT: File 3, [[@LINE+2]]:39 -> [[@LINE+2]]:40 = (#0 + #6) -// CHECK-NEXT: Branch,File 3, [[@LINE+1]]:39 -> [[@LINE+1]]:40 = 0, 0 +// CHECK-NEXT: Branch,File 3, [[@LINE+1]]:39 -> [[@LINE+1]]:40 = 0, #0 #define M2(x) do { if (x) {} } while (0) // CHECK-NEXT: File 4, [[@LINE+5]]:15 -> [[@LINE+5]]:38 = #0 
// CHECK-NEXT: File 4, [[@LINE+4]]:18 -> [[@LINE+4]]:28 = (#0 + #8) // CHECK-NEXT: Expansion,File 4, [[@LINE+3]]:20 -> [[@LINE+3]]:22 = (#0 + #8) // CHECK-NEXT: File 4, [[@LINE+2]]:36 -> [[@LINE+2]]:37 = (#0 + #8) -// CHECK-NEXT: Branch,File 4, [[@LINE+1]]:36 -> [[@LINE+1]]:37 = 0, 0 +// CHECK-NEXT: Branch,File 4, [[@LINE+1]]:36 -> [[@LINE+1]]:37 = 0, #0 #define M3(x) do { M2(x); } while (0) // CHECK-NEXT: File 5, [[@LINE+4]]:15 -> [[@LINE+4]]:27 = #0 // CHECK-NEXT: File 5, [[@LINE+3]]:16 -> [[@LINE+3]]:19 = #0 diff --git a/clang/test/CoverageMapping/mcdc-scratch-space.c b/clang/test/CoverageMapping/mcdc-scratch-space.c index a263e9b688fa..60e456948a51 100644 --- a/clang/test/CoverageMapping/mcdc-scratch-space.c +++ b/clang/test/CoverageMapping/mcdc-scratch-space.c @@ -3,7 +3,7 @@ // CHECK: builtin_macro0: int builtin_macro0(int a) { // CHECK: Decision,File 0, [[@LINE+1]]:11 -> [[@LINE+2]]:15 = M:3, C:2 - return (__LINE__ // CHECK: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:11 = 0, 0 [1,2,0] + return (__LINE__ // CHECK: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:11 = #1, 0 [1,2,0] && a); // CHECK: Branch,File 0, [[@LINE]]:14 -> [[@LINE]]:15 = #2, (#1 - #2) [2,0,0] } @@ -11,7 +11,7 @@ int builtin_macro0(int a) { int builtin_macro1(int a) { // CHECK: Decision,File 0, [[@LINE+1]]:11 -> [[@LINE+2]]:22 = M:3, C:2 return (a // CHECK: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = (#0 - #1), #1 [1,0,2] - || __LINE__); // CHECK: Branch,File 0, [[@LINE]]:14 -> [[@LINE]]:14 = 0, 0 [2,0,0] + || __LINE__); // CHECK: Branch,File 0, [[@LINE]]:14 -> [[@LINE]]:14 = (#1 - #2), 0 [2,0,0] } #define PRE(x) pre_##x diff --git a/clang/test/CoverageMapping/mcdc-system-headers.cpp b/clang/test/CoverageMapping/mcdc-system-headers.cpp index ae26ed5fe469..cb1c8743c36e 100644 --- a/clang/test/CoverageMapping/mcdc-system-headers.cpp +++ b/clang/test/CoverageMapping/mcdc-system-headers.cpp @@ -17,10 +17,10 @@ int func0(int a) { // CHECK: Decision,File 0, [[@LINE+3]]:11 -> [[@LINE+3]]:21 = M:3, C:2 // W_SYS: Expansion,File 0, [[@LINE+2]]:11 -> [[@LINE+2]]:16 = #0 (Expanded file = 1) - // X_SYS: Branch,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:11 = 0, 0 [1,2,0] + // X_SYS: Branch,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:11 = #1, 0 [1,2,0] return (CONST && a); // CHECK: Branch,File 0, [[@LINE-1]]:20 -> [[@LINE-1]]:21 = #2, (#1 - #2) [2,0,0] - // W_SYS: Branch,File 1, [[@LINE-16]]:15 -> [[@LINE-16]]:17 = 0, 0 [1,2,0] + // W_SYS: Branch,File 1, [[@LINE-16]]:15 -> [[@LINE-16]]:17 = #1, 0 [1,2,0] } // CHECK: _Z5func1ii: diff --git a/clang/test/CoverageMapping/switch.cpp b/clang/test/CoverageMapping/switch.cpp index b47c0e800995..a1fee644faaf 100644 --- a/clang/test/CoverageMapping/switch.cpp +++ b/clang/test/CoverageMapping/switch.cpp @@ -2,13 +2,13 @@ // CHECK: foo void foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+11]]:2 = #0 - switch(i) { // CHECK-NEXT: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = ((#0 - #2) - #3), (#2 + #3) + switch(i) { // CHECK-NEXT: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = 0, ((#0 - #2) - #3) // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+5]]:10 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:11 = #2 - return; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #2, (#0 - #2) + return; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #2, 0 // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:12 -> [[@LINE+1]]:3 = 0 case 2: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #3 - break; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #3, (#0 - #3) 
+ break; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #3, 0 } // CHECK-NEXT: Gap,File 0, [[@LINE]]:4 -> [[@LINE+1]]:3 = #1 int x = 0; // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:2 = #1 @@ -18,24 +18,24 @@ int nop() { return 0; } // CHECK: bar void bar(int i) { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+21]]:2 = #0 - switch (i) // CHECK-NEXT: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = #0, 0 + switch (i) // CHECK-NEXT: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = 0, #0 ; // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:6 = 0 switch (i) { // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+17]]:2 = #1 - } // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:11 -> [[@LINE-1]]:12 = #1, 0 + } // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:11 -> [[@LINE-1]]:12 = 0, #1 switch (i) // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+14]]:2 = #2 - nop(); // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:11 -> [[@LINE-1]]:12 = #2, 0 + nop(); // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:11 -> [[@LINE-1]]:12 = 0, #2 // CHECK-NEXT: File 0, [[@LINE-1]]:5 -> [[@LINE-1]]:10 = 0 switch (i) // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+11]]:2 = #3 - case 1: // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:11 -> [[@LINE-1]]:12 = (#3 - #5), #5 + case 1: // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:11 -> [[@LINE-1]]:12 = 0, (#3 - #5) // CHECK-NEXT: File 0, [[@LINE-1]]:3 -> [[@LINE+1]]:10 = #5 - nop(); // CHECK-NEXT: Branch,File 0, [[@LINE-2]]:3 -> [[@LINE-2]]:9 = #5, (#3 - #5) + nop(); // CHECK-NEXT: Branch,File 0, [[@LINE-2]]:3 -> [[@LINE-2]]:9 = #5, 0 // CHECK-NEXT: File 0, [[@LINE+1]]:3 -> [[@LINE+7]]:2 = #4 - switch (i) { // CHECK-NEXT: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = (#4 - #7), #7 + switch (i) { // CHECK-NEXT: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = 0, (#4 - #7) nop(); // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+2]]:10 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #7 - nop(); // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #7, (#4 - #7) + nop(); // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #7, 0 } nop(); // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:2 = #6 } @@ -44,7 +44,7 @@ void bar(int i) { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+21]]:2 = #0 void baz() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+5]]:2 = #0 switch (int i = true ? 
nop() // CHECK: [[@LINE]]:26 -> [[@LINE]]:31 = #2 : nop(); // CHECK-NEXT: [[@LINE]]:26 -> [[@LINE]]:31 = (#0 - #2) - i) {} // CHECK-NEXT: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = #0, 0 + i) {} // CHECK-NEXT: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = 0, #0 nop(); // CHECK-NEXT: [[@LINE]]:3 -> [[@LINE+1]]:2 = #1 } @@ -53,35 +53,35 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+39]]:2 = #0 int i = 0; switch(i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:13 -> [[@LINE+8]]:10 = 0 case 0: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = #2 - i = 1; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #2, (#0 - #2) + i = 1; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #2, 0 break; // CHECK-NEXT: Gap,File 0, [[@LINE]]:11 -> [[@LINE+1]]:3 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = #3 - i = 2; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #3, (#0 - #3) + i = 2; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #3, 0 break; // CHECK-NEXT: Gap,File 0, [[@LINE]]:11 -> [[@LINE+1]]:3 = 0 default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #4 - break; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #4, (#0 - #4) + break; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #4, 0 } // CHECK-NEXT: Gap,File 0, [[@LINE]]:4 -> [[@LINE+1]]:3 = #1 switch(i) { // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+27]]:2 = #1 case 0: // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+7]]:10 = 0 i = 1; // CHECK-NEXT: File 0, [[@LINE-1]]:3 -> [[@LINE+1]]:10 = #6 - break; // CHECK-NEXT: Branch,File 0, [[@LINE-2]]:3 -> [[@LINE-2]]:9 = #6, (#1 - #6) + break; // CHECK-NEXT: Branch,File 0, [[@LINE-2]]:3 -> [[@LINE-2]]:9 = #6, 0 // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:11 -> [[@LINE+1]]:3 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:10 = #7 - i = 2; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #7, (#1 - #7) + i = 2; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #7, 0 default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = (#7 + #8) - break; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #8, (#1 - #8) + break; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #8, 0 } // CHECK-NEXT: Gap,File 0, [[@LINE]]:4 -> [[@LINE+2]]:3 = #5 // CHECK-NEXT: File 0, [[@LINE+1]]:3 -> [[@LINE+17]]:2 = #5 - switch(i) { // CHECK-NEXT: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = ((((#5 - #10) - #11) - #12) - #13), (((#10 + #11) + #12) + #13) + switch(i) { // CHECK-NEXT: Branch,File 0, [[@LINE]]:10 -> [[@LINE]]:11 = 0, ((((#5 - #10) - #11) - #12) - #13) // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+8]]:11 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+7]]:11 = #10 - // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #10, (#5 - #10) + // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #10, 0 case 2: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+5]]:11 = (#10 + #11) - i = 11; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #11, (#5 - #11) + i = 11; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #11, 0 case 3: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:11 = ((#10 + #11) + #12) - // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #12, (#5 - #12) + // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #12, 0 case 4: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:11 = (((#10 + #11) + #12) + #13) - i = 99; // CHECK-NEXT: 
Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #13, (#5 - #13) + i = 99; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #13, 0 } foo(1); // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:11 = #9 @@ -95,10 +95,10 @@ int pr44011(int i) { // CHECK-NEXT: File 0, [[@LINE]]:20 -> {{.*}}:2 = #0 switch (i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:14 -> [[@LINE+6]]:13 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:13 = #2 - return 0; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #2, (#0 - #2) + return 0; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #2, 0 // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+1]]:3 = 0 default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:13 = #3 - return 1; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #3, (#0 - #3) + return 1; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #3, 0 } } // A region for counter #1 is missing due to the missing return. @@ -106,17 +106,17 @@ int pr44011(int i) { // CHECK-NEXT: File 0, [[@LINE]]:20 -> {{.*}}:2 = #0 // FIXME: End location for "case 1" shouldn't point at the end of the switch. // CHECK: fallthrough int fallthrough(int i) { // CHECK-NEXT: File 0, [[@LINE]]:24 -> [[@LINE+14]]:2 = #0 - // CHECK-NEXT: Branch,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:11 = ((((#0 - #2) - #3) - #4) - #5), (((#2 + #3) + #4) + #5) + // CHECK-NEXT: Branch,File 0, [[@LINE+1]]:10 -> [[@LINE+1]]:11 = 0, ((((#0 - #2) - #3) - #4) - #5) switch(i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:13 -> [[@LINE+10]]:10 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+9]]:10 = #2 - i = 23; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #2, (#0 - #2) + i = 23; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #2, 0 case 2: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = (#2 + #3) - i = 11; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #3, (#0 - #3) + i = 11; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #3, 0 break; // CHECK-NEXT: Gap,File 0, [[@LINE]]:11 -> [[@LINE+1]]:3 = 0 case 3: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+4]]:10 = #4 - // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #4, (#0 - #4) + // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #4, 0 case 4: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = (#4 + #5) - i = 99; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #5, (#0 - #5) + i = 99; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #5, 0 break; } } @@ -126,12 +126,12 @@ void abort(void) __attribute((noreturn)); int noret(int x) { // CHECK-NEXT: File 0, [[@LINE]]:18 -> [[@LINE+11]]:2 switch (x) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:14 -> [[@LINE+8]]:14 = 0 default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:12 - abort(); // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #2, (#0 - #2) + abort(); // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #2, 0 // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:3 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:13 - return 5; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #3, (#0 - #3) + return 5; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #3, 0 // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+1]]:3 = 0 case 2: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:14 - return 10; // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #4, (#0 - #4) + return 10; // CHECK-NEXT: 
Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #4, 0 } } diff --git a/clang/test/CoverageMapping/switchmacro.c b/clang/test/CoverageMapping/switchmacro.c index 4c98cc7d9403..0696e7490cdf 100644 --- a/clang/test/CoverageMapping/switchmacro.c +++ b/clang/test/CoverageMapping/switchmacro.c @@ -6,7 +6,7 @@ int foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:16 -> {{[0-9]+}}:2 = #0 switch (i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:14 -> {{[0-9]+}}:11 = 0 default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> {{[0-9]+}}:11 = #2 - // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #2, (#0 - #2) + // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:10 = #2, 0 if (i == 1) // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:15 = #2 // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:9 -> [[@LINE-1]]:15 = #3, (#2 - #3) return 0; // CHECK: File 0, [[@LINE]]:7 -> [[@LINE]]:15 = #3 @@ -15,7 +15,7 @@ int foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:16 -> {{[0-9]+}}:2 = #0 // CHECK-NEXT: File 0, [[@LINE+1]]:8 -> {{[0-9]+}}:11 = (#2 - #3) FOO(1); case 0: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:13 = ((#2 + #4) - #3) - // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #4, (#0 - #4) + // CHECK-NEXT: Branch,File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:9 = #4, 0 return 2; // CHECK-NEXT: Gap,File 0, [[@LINE]]:14 -> [[@LINE+4]]:3 = 0 // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:3 -> [[@LINE+2]]:6 = 0 diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index fa07b3a9e8b1..e631e3899fd4 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -358,20 +358,21 @@ struct CounterMappingRegion { struct CountedRegion : public CounterMappingRegion { uint64_t ExecutionCount; uint64_t FalseExecutionCount; - bool Folded; + bool TrueFolded; + bool FalseFolded; bool HasSingleByteCoverage; CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount, bool HasSingleByteCoverage) : CounterMappingRegion(R), ExecutionCount(ExecutionCount), - FalseExecutionCount(0), Folded(false), + FalseExecutionCount(0), TrueFolded(false), FalseFolded(true), HasSingleByteCoverage(HasSingleByteCoverage) {} CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount, uint64_t FalseExecutionCount, bool HasSingleByteCoverage) : CounterMappingRegion(R), ExecutionCount(ExecutionCount), - FalseExecutionCount(FalseExecutionCount), Folded(false), - HasSingleByteCoverage(HasSingleByteCoverage) {} + FalseExecutionCount(FalseExecutionCount), TrueFolded(false), + FalseFolded(false), HasSingleByteCoverage(HasSingleByteCoverage) {} }; /// MCDC Record grouping all information together. @@ -719,10 +720,10 @@ struct FunctionRecord { Region.Kind == CounterMappingRegion::MCDCBranchRegion) { CountedBranchRegions.emplace_back(Region, Count, FalseCount, HasSingleByteCoverage); - // If both counters are hard-coded to zero, then this region represents a + // If either counter is hard-coded to zero, then this region represents a // constant-folded branch. 
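       // (A condition that is constant-folded at compile time can never take
       // one of its two arms, and the front end emits that arm's counter as a
       // literal zero; the two flags track the true and false arms
       // independently.)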
- if (Region.Count.isZero() && Region.FalseCount.isZero()) - CountedBranchRegions.back().Folded = true; + CountedBranchRegions.back().TrueFolded = Region.Count.isZero(); + CountedBranchRegions.back().FalseFolded = Region.FalseCount.isZero(); return; } if (CountedRegions.empty()) diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index c713371da81e..119e09187b90 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -503,7 +503,7 @@ public: const auto &BranchParams = B->getBranchParams(); PosToID[I] = BranchParams.ID; CondLoc[I] = B->startLoc(); - Folded[I++] = (B->Count.isZero() && B->FalseCount.isZero()); + Folded[I++] = (B->Count.isZero() || B->FalseCount.isZero()); } // Using Profile Bitmap from runtime, mark the executed test vectors. diff --git a/llvm/test/tools/llvm-cov/branch-c-general.test b/llvm/test/tools/llvm-cov/branch-c-general.test index 9b5889babde3..2fa99dfe6153 100644 --- a/llvm/test/tools/llvm-cov/branch-c-general.test +++ b/llvm/test/tools/llvm-cov/branch-c-general.test @@ -47,7 +47,7 @@ // CHECK: Branch (103:9): [True: 9, False: 1] // CHECK: switches() -// CHECK: Branch (113:3): [True: 1, False: 0] +// CHECK: Branch (113:3): [True: 1, Folded] // CHECK: Branch (117:63): [True: 15, False: 0] // CHECK: Branch (119:5): [True: 1, False: 14] // CHECK: Branch (120:11): [True: 0, False: 1] @@ -57,7 +57,7 @@ // CHECK: Branch (126:11): [True: 3, False: 0] // CHECK: Branch (128:5): [True: 4, False: 11] // CHECK: Branch (129:11): [True: 4, False: 0] -// CHECK: Branch (131:7): [True: 4, False: 0] +// CHECK: Branch (131:7): [True: 4, Folded] // CHECK: Branch (132:13): [True: 4, False: 0] // CHECK: Branch (136:5): [True: 5, False: 10] // CHECK: Branch (137:11): [True: 1, False: 4] @@ -114,13 +114,13 @@ -// REPORT: Name Regions Miss Cover Lines Miss Cover Branches Miss Cover +// REPORT: Name Regions Miss Cover Lines Miss Cover Branches Miss Cover // REPORT-NEXT: --- // REPORT-NEXT: simple_loops 8 0 100.00% 9 0 100.00% 6 0 100.00% // REPORT-NEXT: conditionals 24 0 100.00% 15 0 100.00% 16 2 87.50% // REPORT-NEXT: early_exits 20 4 80.00% 25 2 92.00% 16 6 62.50% // REPORT-NEXT: jumps 39 12 69.23% 48 2 95.83% 26 9 65.38% -// REPORT-NEXT: switches 28 5 82.14% 38 4 89.47% 30 9 70.00% +// REPORT-NEXT: switches 28 5 82.14% 38 4 89.47% 28 7 75.00% // REPORT-NEXT: big_switch 25 1 96.00% 32 0 100.00% 30 6 80.00% // REPORT-NEXT: boolean_operators 16 0 100.00% 13 0 100.00% 22 2 90.91% // REPORT-NEXT: boolop_loops 19 0 100.00% 14 0 100.00% 16 2 87.50% @@ -129,12 +129,12 @@ // REPORT-NEXT: main 1 0 100.00% 16 0 100.00% 0 0 0.00% // REPORT-NEXT: c-general.c:static_func 4 0 100.00% 4 0 100.00% 2 0 100.00% // REPORT-NEXT: --- -// REPORT-NEXT: TOTAL 197 24 87.82% 234 8 96.58% 174 38 78.16% +// REPORT-NEXT: TOTAL 197 24 87.82% 234 8 96.58% 172 36 79.07% // Test file-level report. // RUN: llvm-profdata merge %S/Inputs/branch-c-general.proftext -o %t.profdata // RUN: llvm-cov report %S/Inputs/branch-c-general.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs %S/Inputs/branch-c-general.c | FileCheck %s -check-prefix=FILEREPORT -// FILEREPORT: TOTAL{{.*}}174 38 78.16% +// FILEREPORT: TOTAL{{.*}}172 36 79.07% // Test color True/False output. 
// RUN: llvm-cov show --use-color --show-branches=count %S/Inputs/branch-c-general.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs %S/Inputs/branch-c-general.c | FileCheck %s -check-prefix=USECOLOR
 
@@ -161,6 +161,6 @@
 // HTML-INDEX:
 // HTML-INDEX: 87.82% (173/197)
 // HTML-INDEX:
-// HTML-INDEX: 78.16% (136/174)
+// HTML-INDEX: 79.07% (136/172)
 // HTML-INDEX:
 // HTML-INDEX: Totals
diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
index 9a8c7c94f061..4088c1b053aa 100644
--- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -125,7 +125,7 @@ json::Array renderRegions(ArrayRef<CountedRegion> Regions) {
 json::Array renderBranchRegions(ArrayRef<CountedRegion> Regions) {
   json::Array RegionArray;
   for (const auto &Region : Regions)
-    if (!Region.Folded)
+    if (!Region.TrueFolded || !Region.FalseFolded)
       RegionArray.push_back(renderBranch(Region));
   return RegionArray;
 }
diff --git a/llvm/tools/llvm-cov/CoverageExporterLcov.cpp b/llvm/tools/llvm-cov/CoverageExporterLcov.cpp
index ae8f556edb31..d6b9367ae4c5 100644
--- a/llvm/tools/llvm-cov/CoverageExporterLcov.cpp
+++ b/llvm/tools/llvm-cov/CoverageExporterLcov.cpp
@@ -139,7 +139,7 @@ void renderBranchExecutionCounts(raw_ostream &OS,
   unsigned BranchIndex = 0;
 
   while (NextBranch != EndBranch && CurrentLine == NextBranch->LineStart) {
-    if (!NextBranch->Folded) {
+    if (!NextBranch->TrueFolded || !NextBranch->FalseFolded) {
       unsigned BC1 = NextBranch->ExecutionCount;
       unsigned BC2 = NextBranch->FalseExecutionCount;
       bool BranchNotExecuted = (BC1 == 0 && BC2 == 0);
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
index 4f150020ee38..58e7918d3927 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
@@ -19,18 +19,18 @@ using namespace coverage;
 static void sumBranches(size_t &NumBranches, size_t &CoveredBranches,
                         const ArrayRef<CountedRegion> &Branches) {
   for (const auto &BR : Branches) {
-    // Skip folded branches.
-    if (BR.Folded)
-      continue;
-
-    // "True" Condition Branches.
-    ++NumBranches;
-    if (BR.ExecutionCount > 0)
-      ++CoveredBranches;
-    // "False" Condition Branches.
-    ++NumBranches;
-    if (BR.FalseExecutionCount > 0)
-      ++CoveredBranches;
+    if (!BR.TrueFolded) {
+      // "True" Condition Branches.
+      ++NumBranches;
+      if (BR.ExecutionCount > 0)
+        ++CoveredBranches;
+    }
+    if (!BR.FalseFolded) {
+      // "False" Condition Branches.
+      ++NumBranches;
+      if (BR.FalseExecutionCount > 0)
+        ++CoveredBranches;
+    }
   }
 }
diff --git a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
index 6f4d327679d6..7421763dd7a4 100644
--- a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
+++ b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
@@ -1128,36 +1128,45 @@ void SourceCoverageViewHTML::renderBranchView(raw_ostream &OS, BranchView &BRV,
                    "line-number") +
           "): [";
 
-    if (R.Folded) {
+    if (R.TrueFolded && R.FalseFolded) {
       OS << "Folded - Ignored]\n";
       continue;
     }
 
     // Display TrueCount or TruePercent.
-    std::string TrueColor = R.ExecutionCount ? "None" : "red branch";
+    std::string TrueColor =
+        (R.TrueFolded || R.ExecutionCount ? "None" : "red branch");
     std::string TrueCovClass =
-        (R.ExecutionCount > 0) ? 
"covered-line" : "uncovered-line"; - - OS << tag("span", "True", TrueColor); - OS << ": "; - if (getOptions().ShowBranchCounts) - OS << tag("span", formatCount(R.ExecutionCount), TrueCovClass) << ", "; - else - OS << format("%0.2f", TruePercent) << "%, "; + (R.TrueFolded || R.ExecutionCount > 0 ? "covered-line" + : "uncovered-line"); + + if (R.TrueFolded) + OS << "Folded, "; + else { + OS << tag("span", "True", TrueColor) << ": "; + if (getOptions().ShowBranchCounts) + OS << tag("span", formatCount(R.ExecutionCount), TrueCovClass) << ", "; + else + OS << format("%0.2f", TruePercent) << "%, "; + } // Display FalseCount or FalsePercent. - std::string FalseColor = R.FalseExecutionCount ? "None" : "red branch"; + std::string FalseColor = + (R.FalseFolded || R.FalseExecutionCount ? "None" : "red branch"); std::string FalseCovClass = - (R.FalseExecutionCount > 0) ? "covered-line" : "uncovered-line"; - - OS << tag("span", "False", FalseColor); - OS << ": "; - if (getOptions().ShowBranchCounts) - OS << tag("span", formatCount(R.FalseExecutionCount), FalseCovClass); - else - OS << format("%0.2f", FalsePercent) << "%"; - - OS << "]\n"; + (R.FalseFolded || R.FalseExecutionCount > 0 ? "covered-line" + : "uncovered-line"); + + if (R.FalseFolded) + OS << "Folded]\n"; + else { + OS << tag("span", "False", FalseColor) << ": "; + if (getOptions().ShowBranchCounts) + OS << tag("span", formatCount(R.FalseExecutionCount), FalseCovClass) + << "]\n"; + else + OS << format("%0.2f", FalsePercent) << "%]\n"; + } } OS << EndPre; OS << EndExpansionDiv; diff --git a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp index 8b93b592910b..444f33dac108 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp @@ -309,31 +309,38 @@ void SourceCoverageViewText::renderBranchView(raw_ostream &OS, BranchView &BRV, renderLinePrefix(OS, ViewDepth); OS << " Branch (" << R.LineStart << ":" << R.ColumnStart << "): ["; - if (R.Folded) { + if (R.TrueFolded && R.FalseFolded) { OS << "Folded - Ignored]\n"; continue; } - colored_ostream(OS, raw_ostream::RED, - getOptions().Colors && !R.ExecutionCount, - /*Bold=*/false, /*BG=*/true) - << "True"; - - if (getOptions().ShowBranchCounts) - OS << ": " << formatCount(R.ExecutionCount) << ", "; - else - OS << ": " << format("%0.2f", TruePercent) << "%, "; - - colored_ostream(OS, raw_ostream::RED, - getOptions().Colors && !R.FalseExecutionCount, - /*Bold=*/false, /*BG=*/true) - << "False"; + if (R.TrueFolded) + OS << "Folded, "; + else { + colored_ostream(OS, raw_ostream::RED, + getOptions().Colors && !R.ExecutionCount, + /*Bold=*/false, /*BG=*/true) + << "True"; + + if (getOptions().ShowBranchCounts) + OS << ": " << formatCount(R.ExecutionCount) << ", "; + else + OS << ": " << format("%0.2f", TruePercent) << "%, "; + } - if (getOptions().ShowBranchCounts) - OS << ": " << formatCount(R.FalseExecutionCount); - else - OS << ": " << format("%0.2f", FalsePercent) << "%"; - OS << "]\n"; + if (R.FalseFolded) + OS << "Folded]\n"; + else { + colored_ostream(OS, raw_ostream::RED, + getOptions().Colors && !R.FalseExecutionCount, + /*Bold=*/false, /*BG=*/true) + << "False"; + + if (getOptions().ShowBranchCounts) + OS << ": " << formatCount(R.FalseExecutionCount) << "]\n"; + else + OS << ": " << format("%0.2f", FalsePercent) << "%]\n"; + } } } -- GitLab From 861bd36bce3c3e1384b87b0366cf83e2c022c325 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 19 Oct 2024 20:32:58 -0700 Subject: [PATCH 178/511] 
[ELF] Pass Ctx & to Symbol::getVA

---
 lld/ELF/AArch64ErrataFix.cpp  |  2 +-
 lld/ELF/ARMErrataFix.cpp      |  4 +--
 lld/ELF/Arch/AArch64.cpp      | 12 +++----
 lld/ELF/Arch/ARM.cpp          | 25 +++++++------
 lld/ELF/Arch/AVR.cpp          |  2 +-
 lld/ELF/Arch/LoongArch.cpp    |  4 +--
 lld/ELF/Arch/Mips.cpp         |  2 +-
 lld/ELF/Arch/PPC.cpp          |  2 +-
 lld/ELF/Arch/PPC64.cpp        |  4 +--
 lld/ELF/Arch/RISCV.cpp        | 17 ++++-----
 lld/ELF/Arch/SystemZ.cpp      |  2 +-
 lld/ELF/Arch/X86.cpp          |  2 +-
 lld/ELF/Arch/X86_64.cpp       |  2 +-
 lld/ELF/InputSection.cpp      | 44 ++++++++++++-----------
 lld/ELF/MapFile.cpp           |  4 +--
 lld/ELF/OutputSections.cpp    |  2 +-
 lld/ELF/Relocations.cpp       |  5 +--
 lld/ELF/Symbols.cpp           |  6 ++--
 lld/ELF/Symbols.h             |  2 +-
 lld/ELF/SyntheticSections.cpp | 29 +++++++--------
 lld/ELF/Thunks.cpp            | 68 +++++++++++++++++------------------
 lld/ELF/Writer.cpp            |  6 ++--
 22 files changed, 125 insertions(+), 121 deletions(-)

diff --git a/lld/ELF/AArch64ErrataFix.cpp b/lld/ELF/AArch64ErrataFix.cpp
index cd8fbf16f5b8..f9e03ce5bbe4 100644
--- a/lld/ELF/AArch64ErrataFix.cpp
+++ b/lld/ELF/AArch64ErrataFix.cpp
@@ -417,7 +417,7 @@ void Patch843419Section::writeTo(uint8_t *buf) {
 
   // Return address is the next instruction after the one we have just copied.
   uint64_t s = getLDSTAddr() + 4;
-  uint64_t p = patchSym->getVA() + 4;
+  uint64_t p = patchSym->getVA(ctx) + 4;
   ctx.target->relocateNoSym(buf + 4, R_AARCH64_JUMP26, s - p);
 }
 
diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp
index 630084afd509..6d759d7dec1d 100644
--- a/lld/ELF/ARMErrataFix.cpp
+++ b/lld/ELF/ARMErrataFix.cpp
@@ -218,7 +218,7 @@ static bool branchDestInFirstRegion(Ctx &ctx, const InputSection *isec,
   // or the PLT.
   if (r) {
     uint64_t dst =
-        (r->expr == R_PLT_PC) ? r->sym->getPltVA(ctx) : r->sym->getVA();
+        r->expr == R_PLT_PC ? r->sym->getPltVA(ctx) : r->sym->getVA(ctx);
     // Account for Thumb PC bias, usually cancelled to 0 by addend of -4.
     destAddr = dst + r->addend + 4;
   } else {
@@ -449,7 +449,7 @@ static void implementPatch(ScanResult sr, InputSection *isec,
     // Thunk from the patch to the target.
     uint64_t dstSymAddr = (sr.rel->expr == R_PLT_PC) ? sr.rel->sym->getPltVA(ctx)
-                                                     : sr.rel->sym->getVA();
+                                                     : sr.rel->sym->getVA(ctx);
     destIsARM = (dstSymAddr & 1) == 0;
   }
   psec = make<Patch657417Section>(ctx, isec, sr.off, sr.instr, destIsARM);
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 260307ac4c3d..f4f867d01913 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -360,7 +360,7 @@ void AArch64::writeGotPlt(uint8_t *buf, const Symbol &) const {
 
 void AArch64::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
   if (ctx.arg.writeAddends)
-    write64(ctx, buf, s.getVA());
+    write64(ctx, buf, s.getVA(ctx));
 }
 
 void AArch64::writePltHeader(uint8_t *buf) const {
@@ -416,7 +416,7 @@ bool AArch64::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   if (type != R_AARCH64_CALL26 && type != R_AARCH64_JUMP26 &&
       type != R_AARCH64_PLT32)
     return false;
-  uint64_t dst = expr == R_PLT_PC ? s.getPltVA(ctx) : s.getVA(a);
+  uint64_t dst = expr == R_PLT_PC ? s.getPltVA(ctx) : s.getVA(ctx, a);
   return !inBranchRange(type, branchAddr, dst);
 }
 
@@ -808,7 +808,7 @@ bool AArch64Relaxer::tryRelaxAdrpAdd(const Relocation &adrpRel,
   Symbol &sym = *adrpRel.sym;
   // Check if the address difference is within 1MiB range.
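   // (adrp+add pairs are relaxed to a single adr, whose 21-bit signed
   // PC-relative immediate reaches only +/-1MiB.)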
- int64_t val = sym.getVA() - (secAddr + addRel.offset); + int64_t val = sym.getVA(ctx) - (secAddr + addRel.offset); if (val < -1024 * 1024 || val >= 1024 * 1024) return false; @@ -874,7 +874,7 @@ bool AArch64Relaxer::tryRelaxAdrpLdr(const Relocation &adrpRel, return false; // Check if the address difference is within 4GB range. int64_t val = - getAArch64Page(sym.getVA()) - getAArch64Page(secAddr + adrpRel.offset); + getAArch64Page(sym.getVA(ctx)) - getAArch64Page(secAddr + adrpRel.offset); if (val != llvm::SignExtend64(val, 33)) return false; @@ -890,11 +890,11 @@ bool AArch64Relaxer::tryRelaxAdrpLdr(const Relocation &adrpRel, ctx.target->relocate( buf + adrpSymRel.offset, adrpSymRel, - SignExtend64(getAArch64Page(sym.getVA()) - + SignExtend64(getAArch64Page(sym.getVA(ctx)) - getAArch64Page(secAddr + adrpSymRel.offset), 64)); ctx.target->relocate(buf + addRel.offset, addRel, - SignExtend64(sym.getVA(), 64)); + SignExtend64(sym.getVA(ctx), 64)); tryRelaxAdrpAdd(adrpSymRel, addRel, secAddr, buf); return true; } diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 1cc396aa395d..be3f80337aae 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -213,7 +213,7 @@ void ARM::writeGotPlt(uint8_t *buf, const Symbol &) const { void ARM::writeIgotPlt(uint8_t *buf, const Symbol &s) const { // An ARM entry is the address of the ifunc resolver function. - write32(ctx, buf, s.getVA()); + write32(ctx, buf, s.getVA(ctx)); } // Long form PLT Header that does not have any restrictions on the displacement @@ -404,26 +404,26 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb). assert(!useThumbPLTs(ctx) && "If the source is ARM, we should not need Thumb PLTs"); - if (s.isFunc() && expr == R_PC && (s.getVA() & 1)) + if (s.isFunc() && expr == R_PC && (s.getVA(ctx) & 1)) return true; [[fallthrough]]; case R_ARM_CALL: { - uint64_t dst = (expr == R_PLT_PC) ? s.getPltVA(ctx) : s.getVA(); + uint64_t dst = (expr == R_PLT_PC) ? s.getPltVA(ctx) : s.getVA(ctx); return !inBranchRange(type, branchAddr, dst + a) || - (!ctx.arg.armHasBlx && (s.getVA() & 1)); + (!ctx.arg.armHasBlx && (s.getVA(ctx) & 1)); } case R_ARM_THM_JUMP19: case R_ARM_THM_JUMP24: // Source is Thumb, when all PLT entries are ARM interworking is required. // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM). if ((expr == R_PLT_PC && !useThumbPLTs(ctx)) || - (s.isFunc() && (s.getVA() & 1) == 0)) + (s.isFunc() && (s.getVA(ctx) & 1) == 0)) return true; [[fallthrough]]; case R_ARM_THM_CALL: { - uint64_t dst = (expr == R_PLT_PC) ? s.getPltVA(ctx) : s.getVA(); + uint64_t dst = (expr == R_PLT_PC) ? 
s.getPltVA(ctx) : s.getVA(ctx);
     return !inBranchRange(type, branchAddr, dst + a) ||
-           (!ctx.arg.armHasBlx && (s.getVA(ctx) & 1) == 0);;
+           (!ctx.arg.armHasBlx && (s.getVA(ctx) & 1) == 0);
   }
   }
   return false;
@@ -1399,7 +1399,7 @@ void ArmCmseSGSection::writeTo(uint8_t *buf) {
     write16(ctx, p + 4, 0xf000); // B.W S
     write16(ctx, p + 6, 0xb000);
     ctx.target->relocateNoSym(p + 4, R_ARM_THM_JUMP24,
-                              s->acleSeSym->getVA() -
+                              s->acleSeSym->getVA(ctx) -
                                   (getVA() + s->offset + s->size));
   }
 }
@@ -1466,16 +1466,15 @@ template <class ELFT> void elf::writeARMCmseImportLib(Ctx &ctx) {
   osIsPairs.emplace_back(make<OutputSection>(ctx, shstrtab->name, 0, 0),
                          shstrtab);
 
-  std::sort(ctx.symtab->cmseSymMap.begin(), ctx.symtab->cmseSymMap.end(),
-            [](const auto &a, const auto &b) -> bool {
-              return a.second.sym->getVA() < b.second.sym->getVA();
-            });
+  llvm::sort(ctx.symtab->cmseSymMap, [&](const auto &a, const auto &b) {
+    return a.second.sym->getVA(ctx) < b.second.sym->getVA(ctx);
+  });
   // Copy the secure gateway entry symbols to the import library symbol table.
   for (auto &p : ctx.symtab->cmseSymMap) {
     Defined *d = cast<Defined>(p.second.sym);
     impSymTab->addSymbol(make<Defined>(
         ctx, ctx.internalFile, d->getName(), d->computeBinding(ctx),
-        /*stOther=*/0, STT_FUNC, d->getVA(), d->getSize(), nullptr));
+        /*stOther=*/0, STT_FUNC, d->getVA(ctx), d->getSize(), nullptr));
   }
 
   size_t idx = 0;
diff --git a/lld/ELF/Arch/AVR.cpp b/lld/ELF/Arch/AVR.cpp
index 4dc605c47059..64790f1ce83a 100644
--- a/lld/ELF/Arch/AVR.cpp
+++ b/lld/ELF/Arch/AVR.cpp
@@ -110,7 +110,7 @@ bool AVR::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   case R_AVR_HI8_LDI_GS:
     // A thunk is needed if the symbol's virtual address is out of range
     // [0, 0x1ffff].
-    return s.getVA() >= 0x20000;
+    return s.getVA(ctx) >= 0x20000;
   default:
     return false;
   }
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 5923cda2298b..876aadcb9151 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -316,9 +316,9 @@ void LoongArch::writeGotPlt(uint8_t *buf, const Symbol &s) const {
 void LoongArch::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
   if (ctx.arg.writeAddends) {
     if (ctx.arg.is64)
-      write64le(buf, s.getVA());
+      write64le(buf, s.getVA(ctx));
     else
-      write32le(buf, s.getVA());
+      write32le(buf, s.getVA(ctx));
   }
 }
 
diff --git a/lld/ELF/Arch/Mips.cpp b/lld/ELF/Arch/Mips.cpp
index 1d3000793ca2..d84e85239d2e 100644
--- a/lld/ELF/Arch/Mips.cpp
+++ b/lld/ELF/Arch/Mips.cpp
@@ -96,7 +96,7 @@ RelExpr MIPS::getRelExpr(RelType type, const Symbol &s,
     // If the target symbol is not preemptible and is not microMIPS,
     // it might be possible to replace jalr/jr instruction by bal/b.
    // It depends on the target symbol's offset.
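    // (Bit 0 of the VA is the ISA-mode bit: it is set for microMIPS code, so
    // only an even target address is a plain MIPS entry reachable by bal/b.)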
- if (!s.isPreemptible && !(s.getVA() & 0x1)) + if (!s.isPreemptible && !(s.getVA(ctx) & 0x1)) return R_PC; return R_NONE; case R_MICROMIPS_JALR: diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp index 3af4101fff60..2cd526020f7d 100644 --- a/lld/ELF/Arch/PPC.cpp +++ b/lld/ELF/Arch/PPC.cpp @@ -209,7 +209,7 @@ bool PPC::needsThunk(RelExpr expr, RelType type, const InputFile *file, return true; if (s.isUndefWeak()) return false; - return !PPC::inBranchRange(type, branchAddr, s.getVA(a)); + return !PPC::inBranchRange(type, branchAddr, s.getVA(ctx, a)); } uint32_t PPC::getThunkSectionSpacing() const { return 0x2000000; } diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 9f550745f93b..d0f59681ccbd 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -404,7 +404,7 @@ static bool tryRelaxPPC64TocIndirection(Ctx &ctx, const Relocation &rel, assert(!d->isGnuIFunc()); // Two instructions can materialize a 32-bit signed offset from the toc base. - uint64_t tocRelative = d->getVA(addend) - getPPC64TocBase(ctx); + uint64_t tocRelative = d->getVA(ctx, addend) - getPPC64TocBase(ctx); if (!isInt<32>(tocRelative)) return false; @@ -1452,7 +1452,7 @@ bool PPC64::needsThunk(RelExpr expr, RelType type, const InputFile *file, // a range-extending thunk. // See the comment in getRelocTargetVA() about R_PPC64_CALL. return !inBranchRange(type, branchAddr, - s.getVA(a) + + s.getVA(ctx, a) + getPPC64GlobalEntryToLocalEntryOffset(s.stOther)); } diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 1ae016e4de01..7ebb67c36123 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -214,9 +214,9 @@ void RISCV::writeGotPlt(uint8_t *buf, const Symbol &s) const { void RISCV::writeIgotPlt(uint8_t *buf, const Symbol &s) const { if (ctx.arg.writeAddends) { if (ctx.arg.is64) - write64le(buf, s.getVA()); + write64le(buf, s.getVA(ctx)); else - write32le(buf, s.getVA()); + write32le(buf, s.getVA(ctx)); } } @@ -466,7 +466,7 @@ void RISCV::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { case INTERNAL_R_RISCV_GPREL_I: case INTERNAL_R_RISCV_GPREL_S: { Defined *gp = ctx.sym.riscvGlobalPointer; - int64_t displace = SignExtend64(val - gp->getVA(), bits); + int64_t displace = SignExtend64(val - gp->getVA(ctx), bits); checkInt(ctx, loc, displace, 12, rel); uint32_t insn = (read32le(loc) & ~(31 << 15)) | (X_GP << 15); if (rel.type == INTERNAL_R_RISCV_GPREL_I) @@ -657,7 +657,8 @@ void RISCV::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { const Relocation &rel1 = relocs[i + 1]; if (rel.type == R_RISCV_SET_ULEB128 && rel1.type == R_RISCV_SUB_ULEB128 && rel.offset == rel1.offset) { - auto val = rel.sym->getVA(rel.addend) - rel1.sym->getVA(rel1.addend); + auto val = rel.sym->getVA(ctx, rel.addend) - + rel1.sym->getVA(ctx, rel1.addend); if (overwriteULEB128(loc, val) >= 0x80) errorOrWarn(sec.getLocation(rel.offset) + ": ULEB128 value " + Twine(val) + " exceeds available space; references '" + @@ -737,7 +738,7 @@ static void relaxCall(Ctx &ctx, const InputSection &sec, size_t i, uint64_t loc, const uint64_t insnPair = read64le(sec.content().data() + r.offset); const uint32_t rd = extractBits(insnPair, 32 + 11, 32 + 7); const uint64_t dest = - (r.expr == R_PLT_PC ? sym.getPltVA(ctx) : sym.getVA()) + r.addend; + (r.expr == R_PLT_PC ? 
sym.getPltVA(ctx) : sym.getVA(ctx)) + r.addend; const int64_t displace = dest - loc; if (rvc && isInt<12>(displace) && rd == 0) { @@ -759,7 +760,7 @@ static void relaxCall(Ctx &ctx, const InputSection &sec, size_t i, uint64_t loc, // Relax local-exec TLS when hi20 is zero. static void relaxTlsLe(const InputSection &sec, size_t i, uint64_t loc, Relocation &r, uint32_t &remove) { - uint64_t val = r.sym->getVA(r.addend); + uint64_t val = r.sym->getVA(ctx, r.addend); if (hi20(val) != 0) return; uint32_t insn = read32le(sec.content().data() + r.offset); @@ -791,7 +792,7 @@ static void relaxHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, if (!gp) return; - if (!isInt<12>(r.sym->getVA(r.addend) - gp->getVA())) + if (!isInt<12>(r.sym->getVA(ctx, r.addend) - gp->getVA(ctx))) return; switch (r.type) { @@ -863,7 +864,7 @@ static bool relax(Ctx &ctx, InputSection &sec) { // For TLSDESC=>LE, we can use the short form if hi20 is zero. tlsdescRelax = relaxable(relocs, i); toLeShortForm = tlsdescRelax && r.expr == R_RELAX_TLS_GD_TO_LE && - !hi20(r.sym->getVA(r.addend)); + !hi20(r.sym->getVA(ctx, r.addend)); [[fallthrough]]; case R_RISCV_TLSDESC_LOAD_LO12: // For TLSDESC=>LE/IE, AUIPC and L[DW] are removed if relaxable. diff --git a/lld/ELF/Arch/SystemZ.cpp b/lld/ELF/Arch/SystemZ.cpp index 584379638ad9..106b530c31b2 100644 --- a/lld/ELF/Arch/SystemZ.cpp +++ b/lld/ELF/Arch/SystemZ.cpp @@ -188,7 +188,7 @@ void SystemZ::writeGotPlt(uint8_t *buf, const Symbol &s) const { void SystemZ::writeIgotPlt(uint8_t *buf, const Symbol &s) const { if (ctx.arg.writeAddends) - write64be(buf, s.getVA()); + write64be(buf, s.getVA(ctx)); } void SystemZ::writePltHeader(uint8_t *buf) const { diff --git a/lld/ELF/Arch/X86.cpp b/lld/ELF/Arch/X86.cpp index 58199cdb99a2..a36212a5b169 100644 --- a/lld/ELF/Arch/X86.cpp +++ b/lld/ELF/Arch/X86.cpp @@ -181,7 +181,7 @@ void X86::writeGotPlt(uint8_t *buf, const Symbol &s) const { void X86::writeIgotPlt(uint8_t *buf, const Symbol &s) const { // An x86 entry is the address of the ifunc resolver function. - write32le(buf, s.getVA()); + write32le(buf, s.getVA(ctx)); } RelType X86::getDynRel(RelType type) const { diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index df2983f20228..d32ba638b740 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -429,7 +429,7 @@ void X86_64::writeGotPlt(uint8_t *buf, const Symbol &s) const { void X86_64::writeIgotPlt(uint8_t *buf, const Symbol &s) const { // An x86 entry is the address of the ifunc resolver function (for -z rel). if (ctx.arg.writeAddends) - write64le(buf, s.getVA()); + write64le(buf, s.getVA(ctx)); } void X86_64::writePltHeader(uint8_t *buf) const { diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 2e9e8a7007bb..6c34471a9e50 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -508,7 +508,8 @@ void InputSection::copyRelocations(Ctx &ctx, uint8_t *buf, } if (RelTy::HasAddend) - p->r_addend = sym.getVA(addend) - section->getOutputSection()->addr; + p->r_addend = + sym.getVA(ctx, addend) - section->getOutputSection()->addr; // For SHF_ALLOC sections relocated by REL, append a relocation to // sec->relocations so that relocateAlloc transitively called by // writeSections will update the implicit addend. Non-SHF_ALLOC sections @@ -701,7 +702,7 @@ static int64_t getTlsTpOffset(Ctx &ctx, const Symbol &s) { // Variant 1. 
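   // (In Variant 1 the thread pointer points at a fixed two-word TCB header,
   // with the static TLS block laid out at positive offsets after it, which
   // is the wordsize * 2 bias applied below.)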
case EM_ARM: case EM_AARCH64: - return s.getVA(0) + ctx.arg.wordsize * 2 + + return s.getVA(ctx, 0) + ctx.arg.wordsize * 2 + ((tls->p_vaddr - ctx.arg.wordsize * 2) & (tls->p_align - 1)); case EM_MIPS: case EM_PPC: @@ -709,7 +710,7 @@ static int64_t getTlsTpOffset(Ctx &ctx, const Symbol &s) { // Adjusted Variant 1. TP is placed with a displacement of 0x7000, which is // to allow a signed 16-bit offset to reach 0x1000 of TCB/thread-library // data and 0xf000 of the program's TLS segment. - return s.getVA(0) + (tls->p_vaddr & (tls->p_align - 1)) - 0x7000; + return s.getVA(ctx, 0) + (tls->p_vaddr & (tls->p_align - 1)) - 0x7000; case EM_LOONGARCH: case EM_RISCV: // See the comment in handleTlsRelocation. For TLSDESC=>IE, @@ -717,7 +718,7 @@ static int64_t getTlsTpOffset(Ctx &ctx, const Symbol &s) { // `tls` may be null, the return value is ignored. if (s.type != STT_TLS) return 0; - return s.getVA(0) + (tls->p_vaddr & (tls->p_align - 1)); + return s.getVA(ctx, 0) + (tls->p_vaddr & (tls->p_align - 1)); // Variant 2. case EM_HEXAGON: @@ -725,7 +726,7 @@ static int64_t getTlsTpOffset(Ctx &ctx, const Symbol &s) { case EM_SPARCV9: case EM_386: case EM_X86_64: - return s.getVA(0) - tls->p_memsz - + return s.getVA(ctx, 0) - tls->p_memsz - ((-tls->p_vaddr - tls->p_memsz) & (tls->p_align - 1)); default: llvm_unreachable("unhandled ctx.arg.emachine"); @@ -743,13 +744,13 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, case R_AARCH64_AUTH: case R_RISCV_ADD: case R_RISCV_LEB128: - return r.sym->getVA(a); + return r.sym->getVA(ctx, a); case R_ADDEND: return a; case R_RELAX_HINT: return 0; case R_ARM_SBREL: - return r.sym->getVA(a) - getARMStaticBase(*r.sym); + return r.sym->getVA(ctx, a) - getARMStaticBase(*r.sym); case R_GOT: case R_RELAX_TLS_GD_TO_IE_ABS: return r.sym->getGotVA(ctx) + a; @@ -767,9 +768,9 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, return ctx.in.gotPlt->getVA() + a - p; case R_GOTREL: case R_PPC64_RELAX_TOC: - return r.sym->getVA(a) - ctx.in.got->getVA(); + return r.sym->getVA(ctx, a) - ctx.in.got->getVA(); case R_GOTPLTREL: - return r.sym->getVA(a) - ctx.in.gotPlt->getVA(); + return r.sym->getVA(ctx, a) - ctx.in.gotPlt->getVA(); case R_GOTPLT: case R_RELAX_TLS_GD_TO_IE_GOTPLT: return r.sym->getGotVA(ctx) + a - ctx.in.gotPlt->getVA(); @@ -795,7 +796,7 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, r.type); return getLoongArchPageDelta(r.sym->getGotVA(ctx) + a, p, r.type); case R_MIPS_GOTREL: - return r.sym->getVA(a) - ctx.in.mipsGot->getGp(file); + return r.sym->getVA(ctx, a) - ctx.in.mipsGot->getGp(file); case R_MIPS_GOT_GP: return ctx.in.mipsGot->getGp(file) + a; case R_MIPS_GOT_GP_PC: { @@ -836,16 +837,16 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, return ctx.in.mipsGot->getVA() + ctx.in.mipsGot->getTlsIndexOffset(file) - ctx.in.mipsGot->getGp(file); case R_AARCH64_PAGE_PC: { - uint64_t val = r.sym->isUndefWeak() ? p + a : r.sym->getVA(a); + uint64_t val = r.sym->isUndefWeak() ? 
p + a : r.sym->getVA(ctx, a);
     return getAArch64Page(val) - getAArch64Page(p);
   }
   case R_RISCV_PC_INDIRECT: {
     if (const Relocation *hiRel = getRISCVPCRelHi20(this, r))
-      return getRelocTargetVA(ctx, *hiRel, r.sym->getVA());
+      return getRelocTargetVA(ctx, *hiRel, r.sym->getVA(ctx));
     return 0;
   }
   case R_LOONGARCH_PAGE_PC:
-    return getLoongArchPageDelta(r.sym->getVA(a), p, r.type);
+    return getLoongArchPageDelta(r.sym->getVA(ctx, a), p, r.type);
   case R_PC:
   case R_ARM_PCA: {
     uint64_t dest;
@@ -868,9 +869,9 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r,
       else if (ctx.arg.emachine == EM_RISCV)
         dest = getRISCVUndefinedRelativeWeakVA(r.type, p) + a;
       else
-        dest = r.sym->getVA(a);
+        dest = r.sym->getVA(ctx, a);
     } else {
-      dest = r.sym->getVA(a);
+      dest = r.sym->getVA(ctx, a);
     }
     return dest - p;
   }
@@ -891,7 +892,7 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r,
     // target VA computation.
     return r.sym->getPltVA(ctx) - p;
   case R_PPC64_CALL: {
-    uint64_t symVA = r.sym->getVA(a);
+    uint64_t symVA = r.sym->getVA(ctx, a);
     // If we have an undefined weak symbol, we might get here with a symbol
     // address of zero. That could overflow, but the code must be unreachable,
     // so don't bother doing anything at all.
@@ -910,7 +911,7 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r,
     return getPPC64TocBase(ctx) + a;
   case R_RELAX_GOT_PC:
   case R_PPC64_RELAX_GOT_PC:
-    return r.sym->getVA(a) - p;
+    return r.sym->getVA(ctx, a) - p;
   case R_RELAX_TLS_GD_TO_LE:
   case R_RELAX_TLS_IE_TO_LE:
   case R_RELAX_TLS_LD_TO_LE:
@@ -1016,8 +1017,8 @@ void InputSection::relocateNonAlloc(Ctx &ctx, uint8_t *buf,
         if (!ds && tombstone) {
           val = *tombstone;
         } else {
-          val = sym.getVA(addend) -
-                (f->getRelocTargetSym(*it).getVA(0) + getAddend<ELFT>(*it));
+          val = sym.getVA(ctx, addend) -
+                (f->getRelocTargetSym(*it).getVA(ctx) + getAddend<ELFT>(*it));
         }
         if (overwriteULEB128(bufLoc, val) >= 0x80)
           errorOrWarn(getLocation(offset) + ": ULEB128 value " + Twine(val) +
@@ -1083,7 +1084,8 @@ void InputSection::relocateNonAlloc(Ctx &ctx, uint8_t *buf,
     // sections.
     if (LLVM_LIKELY(expr == R_ABS) || expr == R_DTPREL || expr == R_GOTPLTREL ||
         expr == R_RISCV_ADD) {
-      target.relocateNoSym(bufLoc, type, SignExtend64<bits>(sym.getVA(addend)));
+      target.relocateNoSym(bufLoc, type,
+                           SignExtend64<bits>(sym.getVA(ctx, addend)));
       continue;
     }
 
@@ -1116,7 +1118,7 @@ void InputSection::relocateNonAlloc(Ctx &ctx, uint8_t *buf,
       warn(msg);
     target.relocateNoSym(
         bufLoc, type,
-        SignExtend64<bits>(sym.getVA(addend - offset - outSecOff)));
+        SignExtend64<bits>(sym.getVA(ctx, addend - offset - outSecOff)));
   }
 }
 
diff --git a/lld/ELF/MapFile.cpp b/lld/ELF/MapFile.cpp
index 6bbc1ecc646f..84bc95f9bd00 100644
--- a/lld/ELF/MapFile.cpp
+++ b/lld/ELF/MapFile.cpp
@@ -68,7 +68,7 @@ static std::vector<Defined *> getSymbols(Ctx &ctx) {
 static SymbolMapTy getSectionSyms(ArrayRef<Defined *> syms) {
   SymbolMapTy ret;
   for (Defined *dr : syms)
-    ret[dr->section].emplace_back(dr, dr->getVA());
+    ret[dr->section].emplace_back(dr, dr->getVA(ctx));
 
   // Sort symbols by address. We want to print out symbols in the
   // order in the output file rather than the order they appeared
@@ -95,7 +95,7 @@ getSymbolStrings(Ctx &ctx, ArrayRef<Defined *> syms) {
   parallelFor(0, syms.size(), [&](size_t i) {
     raw_string_ostream os(strs[i]);
     OutputSection *osec = syms[i]->getOutputSection();
-    uint64_t vma = syms[i]->getVA();
+    uint64_t vma = syms[i]->getVA(ctx);
     uint64_t lma = osec ? osec->getLMA() + vma - osec->getVA(0) : 0;
     writeHeader(ctx, os, vma, lma, syms[i]->getSize(), 1);
     os << indent16 << toString(*syms[i]);
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 309039fe7e20..6f76c5d73a53 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -624,7 +624,7 @@ encodeOneCrel(Ctx &ctx, raw_svector_ostream &os,
   if (d) {
     SectionBase *section = d->section;
     assert(section->isLive());
-    addend = sym.getVA(addend) - section->getOutputSection()->addr;
+    addend = sym.getVA(ctx, addend) - section->getOutputSection()->addr;
   } else {
     // Encode R_*_NONE(symidx=0).
     symidx = type = addend = 0;
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index c8dcc276c30a..d40348a7b30d 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -2257,7 +2257,7 @@ std::pair<Thunk *, bool> ThunkCreator::getThunk(InputSection *isec,
     if (isThunkSectionCompatible(isec, t->getThunkTargetSym()->section) &&
         t->isCompatibleWith(*isec, rel) &&
         ctx.target->inBranchRange(rel.type, src,
-                                  t->getThunkTargetSym()->getVA(-pcBias)))
+                                  t->getThunkTargetSym()->getVA(ctx, -pcBias)))
       return std::make_pair(t, false);
 
   // No existing compatible Thunk in range, create a new one
@@ -2281,7 +2281,8 @@ std::pair<Thunk *, bool> ThunkCreator::getSyntheticLandingPad(Defined &d,
 // relocation back to its original non-Thunk target.
 bool ThunkCreator::normalizeExistingThunk(Relocation &rel, uint64_t src) {
   if (Thunk *t = thunks.lookup(rel.sym)) {
-    if (ctx.target->inBranchRange(rel.type, src, rel.sym->getVA(rel.addend)))
+    if (ctx.target->inBranchRange(rel.type, src,
+                                  rel.sym->getVA(ctx, rel.addend)))
       return true;
     rel.sym = &t->destination;
     rel.addend = t->addend;
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index 3caa609338e0..6d9b3c839f86 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -58,7 +58,7 @@ std::string lld::toString(const elf::Symbol &sym) {
   return ret;
 }
 
-static uint64_t getSymVA(const Symbol &sym, int64_t addend) {
+static uint64_t getSymVA(Ctx &ctx, const Symbol &sym, int64_t addend) {
   switch (sym.kind()) {
   case Symbol::DefinedKind: {
     auto &d = cast<Defined>(sym);
@@ -141,8 +141,8 @@ static uint64_t getSymVA(const Symbol &sym, int64_t addend) {
   llvm_unreachable("invalid symbol kind");
 }
 
-uint64_t Symbol::getVA(int64_t addend) const {
-  return getSymVA(*this, addend) + addend;
+uint64_t Symbol::getVA(Ctx &ctx, int64_t addend) const {
+  return getSymVA(ctx, *this, addend) + addend;
 }
 
 uint64_t Symbol::getGotVA(Ctx &ctx) const {
diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h
index 86abebe79f8d..339f32e05f16 100644
--- a/lld/ELF/Symbols.h
+++ b/lld/ELF/Symbols.h
@@ -210,7 +210,7 @@ public:
   bool isInGot(Ctx &ctx) const { return getGotIdx(ctx) != uint32_t(-1); }
   bool isInPlt(Ctx &ctx) const { return getPltIdx(ctx) != uint32_t(-1); }
 
-  uint64_t getVA(int64_t addend = 0) const;
+  uint64_t getVA(Ctx &, int64_t addend = 0) const;
 
   uint64_t getGotOffset(Ctx &) const;
   uint64_t getGotVA(Ctx &) const;
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index f50404ed3016..7a344635a1cb 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -749,7 +749,7 @@ void MipsGotSection::addEntry(InputFile &file, Symbol &sym, int64_t addend,
     if (const OutputSection *os = sym.getOutputSection())
       g.pagesMap.insert({os, {}});
     else
-      g.local16.insert({{nullptr, getMipsPageAddr(sym.getVA(addend))}, 0});
+      g.local16.insert({{nullptr, getMipsPageAddr(sym.getVA(ctx, addend))}, 0});
   } else if (sym.isTls())
     g.tls.insert({&sym, 0});
   else if (sym.isPreemptible && expr == R_ABS)
(sym.isPreemptible && expr == R_ABS) @@ -808,10 +808,11 @@ uint64_t MipsGotSection::getPageEntryOffset(const InputFile *f, uint64_t index = 0; if (const OutputSection *outSec = sym.getOutputSection()) { uint64_t secAddr = getMipsPageAddr(outSec->addr); - uint64_t symAddr = getMipsPageAddr(sym.getVA(addend)); + uint64_t symAddr = getMipsPageAddr(sym.getVA(ctx, addend)); index = g.pagesMap.lookup(outSec).firstIndex + (symAddr - secAddr) / 0xffff; } else { - index = g.local16.lookup({nullptr, getMipsPageAddr(sym.getVA(addend))}); + index = + g.local16.lookup({nullptr, getMipsPageAddr(sym.getVA(ctx, addend))}); } return index * ctx.arg.wordsize; } @@ -1099,7 +1100,7 @@ uint64_t MipsGotSection::getGp(const InputFile *f) const { // returns "common" _gp value. For secondary GOTs calculate // individual _gp values. if (!f || f->mipsGotIndex == uint32_t(-1) || f->mipsGotIndex == 0) - return ctx.sym.mipsGp->getVA(0); + return ctx.sym.mipsGp->getVA(ctx, 0); return getVA() + gots[f->mipsGotIndex].startIndex * ctx.arg.wordsize + 0x7ff0; } @@ -1124,7 +1125,7 @@ void MipsGotSection::writeTo(uint8_t *buf) { auto write = [&](size_t i, const Symbol *s, int64_t a) { uint64_t va = a; if (s) - va = s->getVA(a); + va = s->getVA(ctx, a); writeUint(ctx, buf + i * ctx.arg.wordsize, va); }; // Write 'page address' entries to the local part of the GOT. @@ -1522,10 +1523,10 @@ DynamicSection::computeContents() { if (Symbol *b = ctx.symtab->find(ctx.arg.init)) if (b->isDefined()) - addInt(DT_INIT, b->getVA()); + addInt(DT_INIT, b->getVA(ctx)); if (Symbol *b = ctx.symtab->find(ctx.arg.fini)) if (b->isDefined()) - addInt(DT_FINI, b->getVA()); + addInt(DT_FINI, b->getVA(ctx)); } if (part.verSym && part.verSym->isNeeded()) @@ -2288,7 +2289,7 @@ template void SymbolTableSection::writeTo(uint8_t *buf) { const uint32_t shndx = getSymSectionIndex(sym); if (isDefinedHere) { eSym->st_shndx = shndx; - eSym->st_value = sym->getVA(); + eSym->st_value = sym->getVA(ctx); // Copy symbol size if it is a defined symbol. st_size is not // significant for undefined symbols, so whether copying it or not is up // to us if that's the case. We'll leave it as zero because by not @@ -3241,7 +3242,7 @@ void DebugNamesSection::getNameRelocs( Relocs rels) { for (const RelTy &rel : rels) { Symbol &sym = file.getRelocTargetSym(rel); - relocs[rel.r_offset] = sym.getVA(getAddend(rel)); + relocs[rel.r_offset] = sym.getVA(ctx, getAddend(rel)); } } @@ -4356,11 +4357,11 @@ void PPC64LongBranchTargetSection::writeTo(uint8_t *buf) { for (auto entry : entries) { const Symbol *sym = entry.first; int64_t addend = entry.second; - assert(sym->getVA()); + assert(sym->getVA(ctx)); // Need calls to branch to the local entry-point since a long-branch // must be a local-call. 
write64(ctx, buf, - sym->getVA(addend) + + sym->getVA(ctx, addend) + getPPC64GlobalEntryToLocalEntryOffset(sym->stOther)); buf += 8; } @@ -4616,7 +4617,7 @@ createMemtagGlobalDescriptors(Ctx &ctx, for (const Symbol *sym : symbols) { if (!includeInSymtab(ctx, *sym)) continue; - const uint64_t addr = sym->getVA(); + const uint64_t addr = sym->getVA(ctx); const uint64_t size = sym->getSize(); if (addr <= kMemtagGranuleSize && buf != nullptr) @@ -4653,8 +4654,8 @@ createMemtagGlobalDescriptors(Ctx &ctx, bool MemtagGlobalDescriptors::updateAllocSize(Ctx &ctx) { size_t oldSize = getSize(); std::stable_sort(symbols.begin(), symbols.end(), - [](const Symbol *s1, const Symbol *s2) { - return s1->getVA() < s2->getVA(); + [&ctx = ctx](const Symbol *s1, const Symbol *s2) { + return s1->getVA(ctx) < s2->getVA(ctx); }); return oldSize != getSize(); } diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp index 971b2724b3e2..94c0b2409c6c 100644 --- a/lld/ELF/Thunks.cpp +++ b/lld/ELF/Thunks.cpp @@ -464,7 +464,7 @@ private: // This is similar to the handling for ARMThunk. bool mayUseShortThunk = true; int64_t computeOffset() const { - return destination.getVA() - (getThunkTargetSym()->getVA() + 4); + return destination.getVA(ctx) - (getThunkTargetSym()->getVA(ctx) + 4); } }; @@ -550,7 +550,7 @@ void Thunk::setOffset(uint64_t newOffset) { // AArch64 Thunk base class. static uint64_t getAArch64ThunkDestVA(Ctx &ctx, const Symbol &s, int64_t a) { - uint64_t v = s.isInPlt(ctx) ? s.getPltVA(ctx) : s.getVA(a); + uint64_t v = s.isInPlt(ctx) ? s.getPltVA(ctx) : s.getVA(ctx, a); return v; } @@ -558,7 +558,7 @@ bool AArch64Thunk::getMayUseShortThunk() { if (!mayUseShortThunk) return false; uint64_t s = getAArch64ThunkDestVA(ctx, destination, addend); - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t p = getThunkTargetSym()->getVA(ctx); mayUseShortThunk = llvm::isInt<28>(s - p); return mayUseShortThunk; } @@ -569,7 +569,7 @@ void AArch64Thunk::writeTo(uint8_t *buf) { return; } uint64_t s = getAArch64ThunkDestVA(ctx, destination, addend); - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t p = getThunkTargetSym()->getVA(ctx); write32(ctx, buf, 0x14000000); // b S ctx.target->relocateNoSym(buf, R_AARCH64_CALL26, s - p); } @@ -592,7 +592,7 @@ void AArch64ABSLongThunk::writeLong(uint8_t *buf) { // AArch64BTILandingPadThunk that defines landingPad. assert(!mayNeedLandingPad || landingPad != nullptr); uint64_t s = mayNeedLandingPad - ? landingPad->getVA(0) + ? landingPad->getVA(ctx, 0) : getAArch64ThunkDestVA(ctx, destination, addend); memcpy(buf, data, sizeof(data)); ctx.target->relocateNoSym(buf + 8, R_AARCH64_ABS64, s); @@ -621,9 +621,9 @@ void AArch64ADRPThunk::writeLong(uint8_t *buf) { // AArch64BTILandingPadThunk that defines landingPad. assert(!mayNeedLandingPad || landingPad != nullptr); uint64_t s = mayNeedLandingPad - ? landingPad->getVA(0) + ? landingPad->getVA(ctx, 0) : getAArch64ThunkDestVA(ctx, destination, addend); - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t p = getThunkTargetSym()->getVA(ctx); memcpy(buf, data, sizeof(data)); ctx.target->relocateNoSym(buf, R_AARCH64_ADR_PREL_PG_HI21, getAArch64Page(s) - getAArch64Page(p)); @@ -656,8 +656,8 @@ bool AArch64BTILandingPadThunk::getMayUseShortThunk() { return false; // If the target is the following instruction then we can fall // through without the indirect branch. 
- uint64_t s = destination.getVA(addend); - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t s = destination.getVA(ctx, addend); + uint64_t p = getThunkTargetSym()->getVA(ctx); // This function is called before addresses are stable. We need to // work out the range from the thunk to the next section but the // address of the start of the next section depends on the size of @@ -670,8 +670,8 @@ bool AArch64BTILandingPadThunk::getMayUseShortThunk() { } void AArch64BTILandingPadThunk::writeLong(uint8_t *buf) { - uint64_t s = destination.getVA(addend); - uint64_t p = getThunkTargetSym()->getVA() + 4; + uint64_t s = destination.getVA(ctx, addend); + uint64_t p = getThunkTargetSym()->getVA(ctx) + 4; write32(ctx, buf, 0xd503245f); // BTI c write32(ctx, buf + 4, 0x14000000); // B S ctx.target->relocateNoSym(buf + 4, R_AARCH64_CALL26, s - p); @@ -679,7 +679,7 @@ void AArch64BTILandingPadThunk::writeLong(uint8_t *buf) { // ARM Target Thunks static uint64_t getARMThunkDestVA(Ctx &ctx, const Symbol &s) { - uint64_t v = s.isInPlt(ctx) ? s.getPltVA(ctx) : s.getVA(); + uint64_t v = s.isInPlt(ctx) ? s.getPltVA(ctx) : s.getVA(ctx); return SignExtend64<32>(v); } @@ -693,7 +693,7 @@ bool ARMThunk::getMayUseShortThunk() { mayUseShortThunk = false; return false; } - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t p = getThunkTargetSym()->getVA(ctx); int64_t offset = s - p - 8; mayUseShortThunk = llvm::isInt<26>(offset); return mayUseShortThunk; @@ -706,7 +706,7 @@ void ARMThunk::writeTo(uint8_t *buf) { } uint64_t s = getARMThunkDestVA(ctx, destination); - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t p = getThunkTargetSym()->getVA(ctx); int64_t offset = s - p - 8; write32(ctx, buf, 0xea000000); // b S ctx.target->relocateNoSym(buf, R_ARM_JUMP24, offset); @@ -736,7 +736,7 @@ bool ThumbThunk::getMayUseShortThunk() { mayUseShortThunk = false; return false; } - uint64_t p = getThunkTargetSym()->getVA() & ~1; + uint64_t p = getThunkTargetSym()->getVA(ctx) & ~1; int64_t offset = s - p - 4; mayUseShortThunk = llvm::isInt<25>(offset); return mayUseShortThunk; @@ -749,7 +749,7 @@ void ThumbThunk::writeTo(uint8_t *buf) { } uint64_t s = getARMThunkDestVA(ctx, destination); - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t p = getThunkTargetSym()->getVA(ctx); int64_t offset = s - p - 4; write16(ctx, buf + 0, 0xf000); // b.w S write16(ctx, buf + 2, 0xb000); @@ -806,7 +806,7 @@ void ARMV7PILongThunk::writeLong(uint8_t *buf) { write32(ctx, buf + 8, 0xe08cc00f); // L1: add ip, ip, pc write32(ctx, buf + 12, 0xe12fff1c); // bx ip uint64_t s = getARMThunkDestVA(ctx, destination); - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t p = getThunkTargetSym()->getVA(ctx); int64_t offset = s - p - 16; ctx.target->relocateNoSym(buf, R_ARM_MOVW_PREL_NC, offset); ctx.target->relocateNoSym(buf + 4, R_ARM_MOVT_PREL, offset); @@ -826,7 +826,7 @@ void ThumbV7PILongThunk::writeLong(uint8_t *buf) { write16(ctx, buf + 8, 0x44fc); // L1: add ip, pc write16(ctx, buf + 10, 0x4760); // bx ip uint64_t s = getARMThunkDestVA(ctx, destination); - uint64_t p = getThunkTargetSym()->getVA() & ~0x1; + uint64_t p = getThunkTargetSym()->getVA(ctx) & ~0x1; int64_t offset = s - p - 12; ctx.target->relocateNoSym(buf, R_ARM_THM_MOVW_PREL_NC, offset); ctx.target->relocateNoSym(buf + 4, R_ARM_THM_MOVT_PREL, offset); @@ -904,7 +904,7 @@ void ThumbV6MPILongThunk::writeLong(uint8_t *buf) { 0x46c0); // nop ; pad to 4-byte boundary write32(ctx, buf + 12, 0x00000000); // L2: .word S - (P + (L1 - P) + 4) uint64_t s = getARMThunkDestVA(ctx, 
destination); - uint64_t p = getThunkTargetSym()->getVA() & ~0x1; + uint64_t p = getThunkTargetSym()->getVA(ctx) & ~0x1; ctx.target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 12); } @@ -992,7 +992,7 @@ void ARMV4PILongBXThunk::writeLong(uint8_t *buf) { write32(ctx, buf + 8, 0xe12fff1c); // bx ip write32(ctx, buf + 12, 0x00000000); // L2: .word S - (P + (L1 - P) + 8) uint64_t s = getARMThunkDestVA(ctx, destination); - uint64_t p = getThunkTargetSym()->getVA() & ~0x1; + uint64_t p = getThunkTargetSym()->getVA(ctx) & ~0x1; ctx.target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 12); } @@ -1009,7 +1009,7 @@ void ARMV4PILongThunk::writeLong(uint8_t *buf) { write32(ctx, buf + 4, 0xe08ff00c); // L1: add pc, pc, r12 write32(ctx, buf + 8, 0x00000000); // L2: .word S - (P + (L1 - P) + 8) uint64_t s = getARMThunkDestVA(ctx, destination); - uint64_t p = getThunkTargetSym()->getVA() & ~0x1; + uint64_t p = getThunkTargetSym()->getVA(ctx) & ~0x1; ctx.target->relocateNoSym(buf + 8, R_ARM_REL32, s - p - 12); } @@ -1029,7 +1029,7 @@ void ThumbV4PILongBXThunk::writeLong(uint8_t *buf) { write32(ctx, buf + 8, 0xe08cf00f); // L1: add pc, r12, pc write32(ctx, buf + 12, 0x00000000); // L2: .word S - (P + (L1 - P) + 8) uint64_t s = getARMThunkDestVA(ctx, destination); - uint64_t p = getThunkTargetSym()->getVA() & ~0x1; + uint64_t p = getThunkTargetSym()->getVA(ctx) & ~0x1; ctx.target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 16); } @@ -1051,7 +1051,7 @@ void ThumbV4PILongThunk::writeLong(uint8_t *buf) { write32(ctx, buf + 12, 0xe12fff1c); // bx ip write32(ctx, buf + 16, 0x00000000); // L2: .word S - (P + (L1 - P) + 8) uint64_t s = getARMThunkDestVA(ctx, destination); - uint64_t p = getThunkTargetSym()->getVA() & ~0x1; + uint64_t p = getThunkTargetSym()->getVA(ctx) & ~0x1; ctx.target->relocateNoSym(buf + 16, R_ARM_REL32, s - p - 16); } @@ -1067,7 +1067,7 @@ void ThumbV4PILongThunk::addSymbols(ThunkSection &isec) { // Use the long jump which covers a range up to 8MiB. void AVRThunk::writeTo(uint8_t *buf) { write32(ctx, buf, 0x940c); // jmp func - ctx.target->relocateNoSym(buf, R_AVR_CALL, destination.getVA()); + ctx.target->relocateNoSym(buf, R_AVR_CALL, destination.getVA(ctx)); } void AVRThunk::addSymbols(ThunkSection &isec) { @@ -1077,7 +1077,7 @@ void AVRThunk::addSymbols(ThunkSection &isec) { // Write MIPS LA25 thunk code to call PIC function from the non-PIC one. void MipsThunk::writeTo(uint8_t *buf) { - uint64_t s = destination.getVA(); + uint64_t s = destination.getVA(ctx); write32(ctx, buf, 0x3c190000); // lui $25, %hi(func) write32(ctx, buf + 4, 0x08000000 | (s >> 2)); // j func write32(ctx, buf + 8, 0x27390000); // addiu $25, $25, %lo(func) @@ -1099,7 +1099,7 @@ InputSection *MipsThunk::getTargetInputSection() const { // Write microMIPS R2-R5 LA25 thunk code // to call PIC function from the non-PIC one. void MicroMipsThunk::writeTo(uint8_t *buf) { - uint64_t s = destination.getVA(); + uint64_t s = destination.getVA(ctx); write16(ctx, buf, 0x41b9); // lui $25, %hi(func) write16(ctx, buf + 4, 0xd400); // j func write16(ctx, buf + 8, 0x3339); // addiu $25, $25, %lo(func) @@ -1124,8 +1124,8 @@ InputSection *MicroMipsThunk::getTargetInputSection() const { // Write microMIPS R6 LA25 thunk code // to call PIC function from the non-PIC one. 
void MicroMipsR6Thunk::writeTo(uint8_t *buf) { - uint64_t s = destination.getVA(); - uint64_t p = getThunkTargetSym()->getVA(); + uint64_t s = destination.getVA(ctx); + uint64_t p = getThunkTargetSym()->getVA(ctx); write16(ctx, buf, 0x1320); // lui $25, %hi(func) write16(ctx, buf + 4, 0x3339); // addiu $25, $25, %lo(func) write16(ctx, buf + 8, 0x9400); // bc func @@ -1213,9 +1213,9 @@ void PPC32LongThunk::addSymbols(ThunkSection &isec) { void PPC32LongThunk::writeTo(uint8_t *buf) { auto ha = [](uint32_t v) -> uint16_t { return (v + 0x8000) >> 16; }; auto lo = [](uint32_t v) -> uint16_t { return v; }; - uint32_t d = destination.getVA(addend); + uint32_t d = destination.getVA(ctx, addend); if (ctx.arg.isPic) { - uint32_t off = d - (getThunkTargetSym()->getVA() + 8); + uint32_t off = d - (getThunkTargetSym()->getVA(ctx) + 8); write32(ctx, buf + 0, 0x7c0802a6); // mflr r12,0 write32(ctx, buf + 4, 0x429f0005); // bcl r20,r31,.+4 write32(ctx, buf + 8, 0x7d8802a6); // mtctr r12 @@ -1269,7 +1269,7 @@ void PPC64R2SaveStub::writeTo(uint8_t *buf) { write32(ctx, buf + 4, 0x48000000 | (offset & 0x03fffffc)); // b } else if (isInt<34>(offset)) { int nextInstOffset; - uint64_t tocOffset = destination.getVA() - getPPC64TocBase(ctx); + uint64_t tocOffset = destination.getVA(ctx) - getPPC64TocBase(ctx); if (tocOffset >> 16 > 0) { const uint64_t addi = ADDI_R12_TO_R12_NO_DISP | (tocOffset & 0xffff); const uint64_t addis = @@ -1306,8 +1306,8 @@ bool PPC64R2SaveStub::isCompatibleWith(const InputSection &isec, void PPC64R12SetupStub::writeTo(uint8_t *buf) { int64_t offset = - (gotPlt ? destination.getGotPltVA(ctx) : destination.getVA()) - - getThunkTargetSym()->getVA(); + (gotPlt ? destination.getGotPltVA(ctx) : destination.getVA(ctx)) - + getThunkTargetSym()->getVA(ctx); if (!isInt<34>(offset)) reportRangeError(ctx, buf, offset, 34, destination, "R12 setup stub offset"); @@ -1393,7 +1393,7 @@ static Thunk *addThunkAArch64(Ctx &ctx, RelType type, Symbol &s, int64_t a) { // TODO: use B for short Thumb->Arm thunks instead of LDR (this doesn't work for // Arm->Thumb, as in Arm state no BX PC trick; it doesn't switch state). static Thunk *addThunkArmv4(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) { - bool thumb_target = s.getVA(a) & 1; + bool thumb_target = s.getVA(ctx, a) & 1; switch (reloc) { case R_ARM_PC24: diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index c237a5f3793a..975954991cae 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1504,9 +1504,9 @@ template void Writer::finalizeAddressDependentContent() { // .rela.dyn. See also AArch64::relocate. 
if (part.relrAuthDyn) { auto it = llvm::remove_if( - part.relrAuthDyn->relocs, [&part](const RelativeReloc &elem) { + part.relrAuthDyn->relocs, [this, &part](const RelativeReloc &elem) { const Relocation &reloc = elem.inputSec->relocs()[elem.relocIdx]; - if (isInt<32>(reloc.sym->getVA(reloc.addend))) + if (isInt<32>(reloc.sym->getVA(ctx, reloc.addend))) return false; part.relaDyn->addReloc({R_AARCH64_AUTH_RELATIVE, elem.inputSec, reloc.offset, @@ -2713,7 +2713,7 @@ template void Writer::checkSections() { static uint64_t getEntryAddr(Ctx &ctx) { // Case 1, 2 or 3 if (Symbol *b = ctx.symtab->find(ctx.arg.entry)) - return b->getVA(); + return b->getVA(ctx); // Case 4 uint64_t addr; -- GitLab From 5d928ffce22d976b6594496f14351e00c2e4dd78 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 19 Oct 2024 21:02:03 -0700 Subject: [PATCH 179/511] [ELF] Remove error-prone RelocationBaseSection::classof --- lld/ELF/OutputSections.cpp | 6 +++++- lld/ELF/SyntheticSections.h | 8 +------- lld/ELF/Writer.cpp | 7 +++---- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 6f76c5d73a53..6cae7cf8f859 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -882,7 +882,11 @@ void OutputSection::checkDynRelAddends(Ctx &ctx) { // for input .rel[a]. sections which we simply pass through to the // output. We skip over those and only look at the synthetic relocation // sections created during linking. - const auto *sec = dyn_cast(sections[i]); + if (!SyntheticSection::classof(sections[i]) || + !is_contained({ELF::SHT_REL, ELF::SHT_RELA, ELF::SHT_RELR}, + sections[i]->type)) + return; + const auto *sec = cast(sections[i]); if (!sec) return; for (const DynamicReloc &rel : sec->relocs) { diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index d64c4aad8c55..3573767671fe 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -547,13 +547,7 @@ public: void mergeRels(); void partitionRels(); void finalizeContents() override; - static bool classof(const SectionBase *d) { - return SyntheticSection::classof(d) && - (d->type == llvm::ELF::SHT_RELA || d->type == llvm::ELF::SHT_REL || - d->type == llvm::ELF::SHT_RELR || - (d->type == llvm::ELF::SHT_AARCH64_AUTH_RELR && - elf::ctx.arg.emachine == llvm::ELF::EM_AARCH64)); - } + int32_t dynamicTag, sizeDynamicTag; SmallVector relocs; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 975954991cae..ecd4f5e47083 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1690,10 +1690,9 @@ static void removeUnusedSyntheticSections(Ctx &ctx) { // finalizeAddressDependentContent, making .rela.dyn no longer empty. // Conservatively keep .rela.dyn. .relr.auth.dyn can be made empty, but // we would fail to remove it here. 
- if (ctx.arg.emachine == EM_AARCH64 && ctx.arg.relrPackDynRelocs) - if (auto *relSec = dyn_cast<RelocationBaseSection>(sec)) - if (relSec == ctx.mainPart->relaDyn.get()) - return false; + if (ctx.arg.emachine == EM_AARCH64 && ctx.arg.relrPackDynRelocs && + sec == ctx.mainPart->relaDyn.get()) + return false; unused.insert(sec); return true; }); -- GitLab From e6625a2c106f6af468a98323b08c7ce3cf273485 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 19 Oct 2024 21:08:50 -0700 Subject: [PATCH 180/511] [ELF] Pass Ctx & --- lld/ELF/Arch/RISCV.cpp | 6 +++--- lld/ELF/InputSection.cpp | 4 +++- lld/ELF/MapFile.cpp | 4 ++-- lld/ELF/Symbols.cpp | 8 ++++---- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 7ebb67c36123..e80dfbd4351b 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -758,8 +758,8 @@ static void relaxCall(Ctx &ctx, const InputSection &sec, size_t i, uint64_t loc, } // Relax local-exec TLS when hi20 is zero. -static void relaxTlsLe(const InputSection &sec, size_t i, uint64_t loc, - Relocation &r, uint32_t &remove) { +static void relaxTlsLe(Ctx &ctx, const InputSection &sec, size_t i, + uint64_t loc, Relocation &r, uint32_t &remove) { uint64_t val = r.sym->getVA(ctx, r.addend); if (hi20(val) != 0) return; @@ -852,7 +852,7 @@ static bool relax(Ctx &ctx, InputSection &sec) { case R_RISCV_TPREL_LO12_I: case R_RISCV_TPREL_LO12_S: if (relaxable(relocs, i)) - relaxTlsLe(sec, i, loc, r, remove); + relaxTlsLe(ctx, sec, i, loc, r, remove); break; case R_RISCV_HI20: case R_RISCV_LO12_I: diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 6c34471a9e50..3b48fbe07bb0 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -70,8 +70,10 @@ InputSectionBase::InputSectionBase(InputFile *file, uint64_t flags, // If SHF_COMPRESSED is set, parse the header. The legacy .zdebug format is no // longer supported. - if (flags & SHF_COMPRESSED) + if (flags & SHF_COMPRESSED) { + Ctx &ctx = file->ctx; invokeELFT(parseCompressedHeader,); + } } // SHF_INFO_LINK and SHF_GROUP are normally resolved and not copied to the diff --git a/lld/ELF/MapFile.cpp b/lld/ELF/MapFile.cpp index 84bc95f9bd00..afaf04dc72fe 100644 --- a/lld/ELF/MapFile.cpp +++ b/lld/ELF/MapFile.cpp @@ -65,7 +65,7 @@ static std::vector<Defined *> getSymbols(Ctx &ctx) { } // Returns a map from sections to their symbols. -static SymbolMapTy getSectionSyms(ArrayRef<Defined *> syms) { +static SymbolMapTy getSectionSyms(Ctx &ctx, ArrayRef<Defined *> syms) { SymbolMapTy ret; for (Defined *dr : syms) ret[dr->section].emplace_back(dr, dr->getVA(ctx)); @@ -149,7 +149,7 @@ static void printEhFrame(Ctx &ctx, raw_ostream &os, const EhFrameSection *sec) { static void writeMapFile(Ctx &ctx, raw_fd_ostream &os) { // Collect symbol info that we want to print out. std::vector<Defined *> syms = getSymbols(ctx); - SymbolMapTy sectionSyms = getSectionSyms(syms); + SymbolMapTy sectionSyms = getSectionSyms(ctx, syms); DenseMap<Symbol *, std::string> symStr = getSymbolStrings(ctx, syms); // Print out the header line. diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index 6d9b3c839f86..da35bf858cb3 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -44,13 +44,13 @@ LLVM_ATTRIBUTE_UNUSED static inline void assertSymbols() { } // Returns a symbol for an error message. -static std::string maybeDemangleSymbol(StringRef symName) { - return elf::ctx.arg.demangle ? demangle(symName.str()) : symName.str(); +static std::string maybeDemangleSymbol(Ctx &ctx, StringRef symName) { + return ctx.arg.demangle ?
demangle(symName.str()) : symName.str(); } std::string lld::toString(const elf::Symbol &sym) { StringRef name = sym.getName(); - std::string ret = maybeDemangleSymbol(name); + std::string ret = maybeDemangleSymbol(ctx, name); const char *suffix = sym.getVersionSuffix(); if (*suffix == '@') @@ -617,7 +617,7 @@ void Symbol::resolve(Ctx &ctx, const LazySymbol &other) { // For common objects, we want to look for global or weak definitions that // should be extracted as the canonical definition instead. - if (LLVM_UNLIKELY(isCommon()) && elf::ctx.arg.fortranCommon && + if (LLVM_UNLIKELY(isCommon()) && ctx.arg.fortranCommon && other.file->shouldExtractForCommon(getName())) { ctx.backwardReferences.erase(this); other.overwrite(*this); -- GitLab From 11dad2fa5138a50d60a5a34a2c7e074b976820e2 Mon Sep 17 00:00:00 2001 From: Pranav Bhandarkar Date: Sun, 20 Oct 2024 01:01:39 -0500 Subject: [PATCH 181/511] [flang][OpenMP] - Add `MapInfoOp` instances for target private variables when needed (#109862) This PR adds an OpenMP-dialect-related pass for FIR/HLFIR which creates `MapInfoOp` instances for certain privatized symbols. For example, if an allocatable variable is used in a private clause attached to an `omp.target` op, then the allocatable variable's descriptor will be needed on the device (e.g. GPU). This descriptor needs to be separately mapped onto the device. This pass creates the necessary `omp.map.info` ops for this. --- .../include/flang/Optimizer/OpenMP/Passes.td | 13 ++ flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 + .../OpenMP/MapsForPrivatizedSymbols.cpp | 156 ++++++++++++++++++ flang/lib/Optimizer/Passes/Pipelines.cpp | 1 + .../target-private-allocatable.f90 | 20 ++- .../target-private-multiple-variables.f90 | 24 ++- .../omp-maps-for-privatized-symbols.fir | 48 ++++++ mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 2 +- 8 files changed, 251 insertions(+), 14 deletions(-) create mode 100644 flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp create mode 100644 flang/test/Transforms/omp-maps-for-privatized-symbols.fir diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 1c0ce08f5b48..c070bc22ff20 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -22,6 +22,19 @@ def MapInfoFinalizationPass let dependentDialects = ["mlir::omp::OpenMPDialect"]; } +def MapsForPrivatizedSymbolsPass + : Pass<"omp-maps-for-privatized-symbols", "mlir::func::FuncOp"> { + let summary = "Creates MapInfoOp instances for privatized symbols when needed"; + let description = [{ + Adds omp.map.info operations for privatized symbols on omp.target ops. + In certain situations, such as when an allocatable is privatized, its + descriptor is needed in the alloc region of the privatizer. This results + in the use of the descriptor inside the target region. As such, the + descriptor then needs to be mapped. This pass adds such MapInfoOp operations.
+ }]; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + def MarkDeclareTargetPass : Pass<"omp-mark-declare-target", "mlir::ModuleOp"> { let summary = "Marks all functions called by an OpenMP declare target function as declare target"; diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 92051634f037..035d0d5ca46c 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_flang_library(FlangOpenMPTransforms FunctionFiltering.cpp + MapsForPrivatizedSymbols.cpp MapInfoFinalization.cpp MarkDeclareTarget.cpp diff --git a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp new file mode 100644 index 000000000000..2fa55844aec7 --- /dev/null +++ b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp @@ -0,0 +1,156 @@ +//===- MapsForPrivatizedSymbols.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +/// \file +/// An OpenMP-dialect-related pass for FIR/HLFIR which creates MapInfoOp +/// instances for certain privatized symbols. +/// For example, if an allocatable variable is used in a private clause attached +/// to an omp.target op, then the allocatable variable's descriptor will be +/// needed on the device (e.g. GPU). This descriptor needs to be separately +/// mapped onto the device. This pass creates the necessary omp.map.info ops for +/// this. +//===----------------------------------------------------------------------===// +// TODO: +// 1. Before adding omp.map.info, check if we already have an omp.map.info for +// the variable in question. +// 2. Generalize this for more than just omp.target ops.
+//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/OpenMP/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/Pass/Pass.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Support/Debug.h" +#include <type_traits> + +#define DEBUG_TYPE "omp-maps-for-privatized-symbols" + +namespace flangomp { +#define GEN_PASS_DEF_MAPSFORPRIVATIZEDSYMBOLSPASS +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp +using namespace mlir; +namespace { +class MapsForPrivatizedSymbolsPass + : public flangomp::impl::MapsForPrivatizedSymbolsPassBase< + MapsForPrivatizedSymbolsPass> { + + bool privatizerNeedsMap(omp::PrivateClauseOp &privatizer) { + Region &allocRegion = privatizer.getAllocRegion(); + Value blockArg0 = allocRegion.getArgument(0); + if (blockArg0.use_empty()) + return false; + return true; + } + omp::MapInfoOp createMapInfo(Location loc, Value var, + fir::FirOpBuilder &builder) { + uint64_t mapTypeTo = static_cast< + std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); + Operation *definingOp = var.getDefiningOp(); + auto declOp = llvm::dyn_cast_or_null<hlfir::DeclareOp>(definingOp); + assert(declOp && + "Expected defining Op of privatized var to be hlfir.declare"); + + // We want the first result of the hlfir.declare op because our goal + // is to map the descriptor (fir.box or fir.boxchar) and the first + // result of hlfir.declare is the descriptor if the symbol being + // declared needs a descriptor. + Value varPtr = declOp.getBase(); + + // If we do not have a reference to the descriptor, but the descriptor + // itself, then we need to store that on the stack so that we can map the + // address of the descriptor.
+ if (mlir::isa<fir::BaseBoxType>(varPtr.getType()) || + mlir::isa<fir::BoxCharType>(varPtr.getType())) { + OpBuilder::InsertPoint savedInsPoint = builder.saveInsertionPoint(); + mlir::Block *allocaBlock = builder.getAllocaBlock(); + assert(allocaBlock && "No alloca block found for a funcOp"); + builder.setInsertionPointToStart(allocaBlock); + auto alloca = builder.create<fir::AllocaOp>(loc, varPtr.getType()); + builder.restoreInsertionPoint(savedInsPoint); + builder.create<fir::StoreOp>(loc, varPtr, alloca); + varPtr = alloca; + } + return builder.create<omp::MapInfoOp>( + loc, varPtr.getType(), varPtr, + TypeAttr::get(llvm::cast<omp::PointerLikeType>(varPtr.getType()) + .getElementType()), + /*varPtrPtr=*/Value{}, + /*members=*/SmallVector<Value>{}, + /*member_index=*/DenseIntElementsAttr{}, + /*bounds=*/ValueRange{}, + builder.getIntegerAttr(builder.getIntegerType(64, /*isSigned=*/false), + mapTypeTo), + builder.getAttr<omp::VariableCaptureKindAttr>( + omp::VariableCaptureKind::ByRef), + StringAttr(), builder.getBoolAttr(false)); + } + void addMapInfoOp(omp::TargetOp targetOp, omp::MapInfoOp mapInfoOp) { + auto argIface = llvm::cast<omp::BlockArgOpenMPOpInterface>(*targetOp); + unsigned insertIndex = + argIface.getMapBlockArgsStart() + argIface.numMapBlockArgs(); + targetOp.getMapVarsMutable().append(ValueRange{mapInfoOp}); + targetOp.getRegion().insertArgument(insertIndex, mapInfoOp.getType(), + mapInfoOp.getLoc()); + } + void addMapInfoOps(omp::TargetOp targetOp, + llvm::SmallVectorImpl<omp::MapInfoOp> &mapInfoOps) { + for (auto mapInfoOp : mapInfoOps) + addMapInfoOp(targetOp, mapInfoOp); + } + void runOnOperation() override { + ModuleOp module = getOperation()->getParentOfType<ModuleOp>(); + fir::KindMapping kindMap = fir::getKindMapping(module); + fir::FirOpBuilder builder{module, std::move(kindMap)}; + llvm::DenseMap<Operation *, llvm::SmallVector<omp::MapInfoOp, 4>> + mapInfoOpsForTarget; + + getOperation()->walk([&](omp::TargetOp targetOp) { + if (targetOp.getPrivateVars().empty()) + return; + OperandRange privVars = targetOp.getPrivateVars(); + std::optional<ArrayAttr> privSyms = targetOp.getPrivateSyms(); + SmallVector<omp::MapInfoOp, 4> mapInfoOps; + for (auto [privVar, privSym] : llvm::zip_equal(privVars, *privSyms)) { + + SymbolRefAttr privatizerName = llvm::cast<SymbolRefAttr>(privSym); + omp::PrivateClauseOp privatizer = + SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>( + targetOp, privatizerName); + if (!privatizerNeedsMap(privatizer)) { + continue; + } + builder.setInsertionPoint(targetOp); + Location loc = targetOp.getLoc(); + omp::MapInfoOp mapInfoOp = createMapInfo(loc, privVar, builder); + mapInfoOps.push_back(mapInfoOp); + LLVM_DEBUG(llvm::dbgs() << "MapsForPrivatizedSymbolsPass created ->\n"); + LLVM_DEBUG(mapInfoOp.dump()); + } + if (!mapInfoOps.empty()) { + mapInfoOpsForTarget.insert({targetOp.getOperation(), mapInfoOps}); + } + }); + if (!mapInfoOpsForTarget.empty()) { + for (auto &[targetOp, mapInfoOps] : mapInfoOpsForTarget) { + addMapInfoOps(static_cast<omp::TargetOp>(targetOp), mapInfoOps); + } + } + } +}; +} // namespace diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 3fa5c54403bd..3c139f7e9340 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -243,6 +243,7 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, /// rather than the host device.
void createOpenMPFIRPassPipeline(mlir::PassManager &pm, bool isTargetDevice) { pm.addPass(flangomp::createMapInfoFinalizationPass()); + pm.addPass(flangomp::createMapsForPrivatizedSymbolsPass()); pm.addPass(flangomp::createMarkDeclareTargetPass()); if (isTargetDevice) pm.addPass(flangomp::createFunctionFilteringPass()); diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 index a27de1152ce1..e11525c569ff 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 @@ -18,22 +18,22 @@ end subroutine target_allocatable ! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : ! CHECK-SAME: [[TYPE:!fir.ref>>]] alloc { ! CHECK: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]): -! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box> {bindc_name = "alloc_var", {{.*}}} +! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca [[DESC_TYPE:!fir.box>]] {bindc_name = "alloc_var", {{.*}}} -! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref>> -! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box>) -> !fir.heap +! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]] +! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap ! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap) -> i64 ! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64 ! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 ! CHECK-NEXT: fir.if %[[ALLOC_COND]] { ! CHECK: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, {{.*}}} -! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap) -> !fir.box> -! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : !fir.ref>> +! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap) -> [[DESC_TYPE]] +! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] ! CHECK-NEXT: } else { ! CHECK-NEXT: %[[ZERO_BITS:.*]] = fir.zero_bits !fir.heap -! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[ZERO_BITS]] : (!fir.heap) -> !fir.box> -! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : !fir.ref>> +! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[ZERO_BITS]] : (!fir.heap) -> [[DESC_TYPE]] +! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] ! CHECK-NEXT: } ! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] @@ -63,9 +63,11 @@ end subroutine target_allocatable ! CHECK-LABEL: func.func @_QPtarget_allocatable() { -! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca !fir.box> +! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]] ! CHECK-SAME: {bindc_name = "alloc_var", {{.*}}} ! CHECK: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]] -! CHECK: omp.target private( +! CHECK: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) +! CHECK-SAME: map_clauses(to) capture(ByRef) -> [[TYPE]] +! CHECK: omp.target map_entries(%[[MAP_VAR]] -> %arg0 : [[TYPE]]) private( ! 
CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} : [[TYPE]]) { diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 index ce98f518581a..b0c76ff3845f 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 @@ -147,12 +147,29 @@ end subroutine target_allocatable ! CHECK-NEXT: } ! CHECK: func.func @_QPtarget_allocatable +! CHECK: %[[CHAR_VAR_DESC_ALLOCA:.*]] = fir.alloca !fir.boxchar<1> +! CHECK: %[[REAL_ARR_DESC_ALLOCA:.*]] = fir.alloca !fir.box> +! CHECK: %[[ALLOC_VAR_ALLOCA:.*]] = fir.alloca !fir.box> {bindc_name = "alloc_var", {{.*}}} +! CHECK: %[[ALLOC_VAR_DECL:.*]]:2 = hlfir.declare %[[ALLOC_VAR_ALLOCA]] ! CHECK: %[[MAPPED_ALLOC:.*]] = fir.alloca i32 {bindc_name = "mapped_var", {{.*}}} ! CHECK-NEXT: %[[MAPPED_DECL:.*]]:2 = hlfir.declare %[[MAPPED_ALLOC]] -! CHECK: %[[MAPPED_MI:.*]] = omp.map.info var_ptr(%[[MAPPED_DECL]]#1 : !fir.ref, i32) - +! CHECK: %[[CHAR_VAR_ALLOC:.*]] = fir.alloca !fir.char<1,?>{{.*}} {bindc_name = "char_var", {{.*}}} +! CHECK: %[[CHAR_VAR_DECL:.*]]:2 = hlfir.declare %[[CHAR_VAR_ALLOC]] typeparams +! CHECK: %[[REAL_ARR_ALLOC:.*]] = fir.alloca !fir.array, {{.*}} {bindc_name = "real_arr", {{.*}}} +! CHECK: %[[REAL_ARR_DECL:.*]]:2 = hlfir.declare %[[REAL_ARR_ALLOC]]({{.*}}) +! CHECK: %[[MAPPED_MI0:.*]] = omp.map.info var_ptr(%[[MAPPED_DECL]]#1 : !fir.ref, i32) {{.*}} +! CHECK: %[[ALLOC_VAR_MAP:.*]] = omp.map.info var_ptr(%[[ALLOC_VAR_DECL]]#0 : !fir.ref>>, !fir.box>) +! CHECK: fir.store %[[REAL_ARR_DECL]]#0 to %[[REAL_ARR_DESC_ALLOCA]] : !fir.ref>> +! CHECK: %[[REAL_ARR_DESC_MAP:.*]] = omp.map.info var_ptr(%[[REAL_ARR_DESC_ALLOCA]] : !fir.ref>>, !fir.box>) +! CHECK: fir.store %[[CHAR_VAR_DECL]]#0 to %[[CHAR_VAR_DESC_ALLOCA]] : !fir.ref> +! CHECK: %[[CHAR_VAR_DESC_MAP:.*]] = omp.map.info var_ptr(%[[CHAR_VAR_DESC_ALLOCA]] : !fir.ref>, !fir.boxchar<1>) ! CHECK: omp.target -! CHECK-SAME: map_entries(%[[MAPPED_MI]] -> %[[MAPPED_ARG:.*]] : !fir.ref) +! CHECK-SAME: map_entries( +! CHECK-SAME: %[[MAPPED_MI0]] -> %[[MAPPED_ARG0:[^,]+]], +! CHECK-SAME: %[[ALLOC_VAR_MAP]] -> %[[MAPPED_ARG1:[^,]+]] +! CHECK-SAME %[[REAL_ARR_DESC_MAP]] -> %[[MAPPED_ARG2:[^,]+]] +! CHECK_SAME %[[CHAR_VAR_DESC_MAP]] -> %[[MAPPED_ARG3:.[^,]+]] : +! CHECK-SAME !fir.ref, !fir.ref>>, !fir.ref>>, !fir.ref>) ! CHECK-SAME: private( ! CHECK-SAME: @[[ALLOC_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[ALLOC_ARG:[^,]+]], ! CHECK-SAME: @[[REAL_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[REAL_ARG:[^,]+]], @@ -162,7 +179,6 @@ end subroutine target_allocatable ! CHECK-SAME: @[[CHAR_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[CHAR_ARG:[^,]+]] : ! CHECK-SAME: !fir.ref>>, !fir.ref, !fir.ref, !fir.box>, !fir.ref>, !fir.boxchar<1>) { ! CHECK-NOT: fir.alloca -! CHECK: hlfir.declare %[[MAPPED_ARG]] ! CHECK: hlfir.declare %[[ALLOC_ARG]] ! CHECK: hlfir.declare %[[REAL_ARG]] ! 
CHECK: hlfir.declare %[[LB_ARG]] diff --git a/flang/test/Transforms/omp-maps-for-privatized-symbols.fir b/flang/test/Transforms/omp-maps-for-privatized-symbols.fir new file mode 100644 index 000000000000..d32444aaabf2 --- /dev/null +++ b/flang/test/Transforms/omp-maps-for-privatized-symbols.fir @@ -0,0 +1,48 @@ +// RUN: fir-opt --split-input-file --omp-maps-for-privatized-symbols %s | FileCheck %s +module attributes {omp.is_target_device = false} { + omp.private {type = private} @_QFtarget_simpleEsimple_var_private_ref_box_heap_i32 : !fir.ref>> alloc { + ^bb0(%arg0: !fir.ref>>): + %0 = fir.alloca !fir.box> {bindc_name = "simple_var", pinned, uniq_name = "_QFtarget_simpleEsimple_var"} + %1 = fir.load %arg0 : !fir.ref>> + %5:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtarget_simpleEsimple_var"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + omp.yield(%5#0 : !fir.ref>>) + } + func.func @_QPtarget_simple() { + %0 = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFtarget_simpleEa"} + %1:2 = hlfir.declare %0 {uniq_name = "_QFtarget_simpleEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %2 = fir.alloca !fir.box> {bindc_name = "simple_var", uniq_name = "_QFtarget_simpleEsimple_var"} + %3 = fir.zero_bits !fir.heap + %4 = fir.embox %3 : (!fir.heap) -> !fir.box> + fir.store %4 to %2 : !fir.ref>> + %5:2 = hlfir.declare %2 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtarget_simpleEsimple_var"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + %c2_i32 = arith.constant 2 : i32 + hlfir.assign %c2_i32 to %1#0 : i32, !fir.ref + %6 = omp.map.info var_ptr(%1#1 : !fir.ref, i32) map_clauses(to) capture(ByRef) -> !fir.ref {name = "a"} + omp.target map_entries(%6 -> %arg0 : !fir.ref) private(@_QFtarget_simpleEsimple_var_private_ref_box_heap_i32 %5#0 -> %arg1 : !fir.ref>>) { + %11:2 = hlfir.declare %arg0 {uniq_name = "_QFtarget_simpleEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %12:2 = hlfir.declare %arg1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtarget_simpleEsimple_var"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + %c10_i32 = arith.constant 10 : i32 + %13 = fir.load %11#0 : !fir.ref + %14 = arith.addi %c10_i32, %13 : i32 + hlfir.assign %14 to %12#0 realloc : i32, !fir.ref>> + omp.terminator + } + %7 = fir.load %5#1 : !fir.ref>> + %8 = fir.box_addr %7 : (!fir.box>) -> !fir.heap + %9 = fir.convert %8 : (!fir.heap) -> i64 + %c0_i64 = arith.constant 0 : i64 + %10 = arith.cmpi ne, %9, %c0_i64 : i64 + fir.if %10 { + %11 = fir.load %5#1 : !fir.ref>> + %12 = fir.box_addr %11 : (!fir.box>) -> !fir.heap + fir.freemem %12 : !fir.heap + %13 = fir.zero_bits !fir.heap + %14 = fir.embox %13 : (!fir.heap) -> !fir.box> + fir.store %14 to %5#1 : !fir.ref>> + } + return + } +} +// CHECK: %[[MAP0:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref, i32) map_clauses(to) capture(ByRef) -> !fir.ref {name = "a"} +// CHECK: %[[MAP1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) -> !fir.ref>> +// CHECK: omp.target map_entries(%[[MAP0]] -> %arg0, %[[MAP1]] -> %arg1 : !fir.ref, !fir.ref>>) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 45313200d4f0..626539cb7bde 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -948,7 +948,7 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> { objects (e.g. derived types or classes), indicates the bounds to be copied of the variable. 
When it's an array slice it is in rank order where rank 0 is the inner-most dimension. - - 'map_clauses': OpenMP map type for this map capture, for example: from, to and + - 'map_type': OpenMP map type for this map capture, for example: from, to and always. It's a bitfield composed of the OpenMP runtime flags stored in OpenMPOffloadMappingFlags. - 'map_capture_type': Capture type for the variable e.g. this, byref, byvalue, byvla -- GitLab From e6c01432b6fb6077e1bdf2e0abf05d2c2dd3fd3e Mon Sep 17 00:00:00 2001 From: OverMighty Date: Sun, 20 Oct 2024 09:49:19 +0200 Subject: [PATCH 182/511] [libc][math][c23] Update newhdrgen for new _Float16 math functions (#113005) --- libc/newhdrgen/yaml/math.yaml | 56 +++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index 98ea1a0d25fb..fe07803cff06 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -206,6 +206,13 @@ functions: return_type: float arguments: - type: float + - name: coshf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: ddivl standards: - stdc @@ -266,6 +273,13 @@ functions: return_type: float arguments: - type: float + - name: exp10m1f16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: exp2 standards: - stdc @@ -1557,6 +1571,13 @@ functions: return_type: float arguments: - type: float + - name: log10f16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: log1p standards: - stdc @@ -1581,6 +1602,13 @@ functions: return_type: float arguments: - type: float + - name: log2f16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: logb standards: - stdc @@ -1619,6 +1647,13 @@ functions: return_type: float arguments: - type: float + - name: logf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: lrint standards: - stdc @@ -2297,6 +2332,13 @@ functions: return_type: float arguments: - type: float + - name: sinhf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: sinpif16 standards: - stdc @@ -2323,6 +2365,13 @@ functions: arguments: - type: float128 guard: LIBC_TYPES_HAS_FLOAT128 + - name: sqrtf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: sqrtl standards: - stdc @@ -2347,6 +2396,13 @@ functions: return_type: float arguments: - type: float + - name: tanhf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: totalorder standards: - stdc -- GitLab From ba1255def64a9c3c68d97ace051eec76f546eeb0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 20 Oct 2024 13:05:23 +0100 Subject: [PATCH 183/511] [DAG] Use FoldConstantArithmetic to constant fold (and (ext (and V, c1)), c2) -> (and (ext V), (and c1, (ext c2))) Noticed while triaging the regression from #112710 noticed by @mstorsjo - don't rely on isConstantIntBuildVectorOrConstantInt+getNode to guarantee constant folding (if it fails to constant fold it will infinite loop), use FoldConstantArithmetic instead. 
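For reference, a minimal sketch of the guarded pattern (condensed from the DAGCombiner.cpp hunk below; `N0Op0` is the inner AND and `ExtOpc` the extension opcode):

  // FoldConstantArithmetic returns a null SDValue when it cannot fold, so
  // the rewrite fires only when both constants genuinely fold away, whereas
  // getNode could hand back an unfolded AND node and re-trigger the same
  // combine forever.
  if (SDValue NewExt =
          DAG.FoldConstantArithmetic(ExtOpc, DL, VT, {N0Op0.getOperand(1)}))
    if (SDValue NewMask =
            DAG.FoldConstantArithmetic(ISD::AND, DL, VT, {N1, NewExt}))
      // Both folds succeeded: emit (and (ext V), (and c1, (ext c2))).
      return DAG.getNode(ISD::AND, DL, VT,
                         DAG.getNode(ExtOpc, DL, VT, N0Op0.getOperand(0)),
                         NewMask);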
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f89734fb43e9..2527bb269643 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7159,15 +7159,16 @@ SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0Op0 = N0.getOperand(0); if (N0Op0.getOpcode() == ISD::AND && (ExtOpc != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0Op0, VT)) && - DAG.isConstantIntBuildVectorOrConstantInt(N1) && - DAG.isConstantIntBuildVectorOrConstantInt(N0Op0.getOperand(1)) && N0->hasOneUse() && N0Op0->hasOneUse()) { - SDValue NewMask = - DAG.getNode(ISD::AND, DL, VT, N1, - DAG.getNode(ExtOpc, DL, VT, N0Op0.getOperand(1))); - return DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(ExtOpc, DL, VT, N0Op0.getOperand(0)), - NewMask); + if (SDValue NewExt = DAG.FoldConstantArithmetic(ExtOpc, DL, VT, + {N0Op0.getOperand(1)})) { + if (SDValue NewMask = + DAG.FoldConstantArithmetic(ISD::AND, DL, VT, {N1, NewExt})) { + return DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(ExtOpc, DL, VT, N0Op0.getOperand(0)), + NewMask); + } + } } } -- GitLab From 94cddcfc1ca21958add4355653872e8eea2557b7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 20 Oct 2024 13:53:26 +0100 Subject: [PATCH 184/511] [ARM] Add reduced regression test for infinite-loop due to #112710 --- llvm/test/CodeGen/ARM/pr112710.ll | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 llvm/test/CodeGen/ARM/pr112710.ll diff --git a/llvm/test/CodeGen/ARM/pr112710.ll b/llvm/test/CodeGen/ARM/pr112710.ll new file mode 100644 index 000000000000..006d564a6d3a --- /dev/null +++ b/llvm/test/CodeGen/ARM/pr112710.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-- | FileCheck %s + +; Reduced regression test for infinite-loop due to #112710 +define void @test(i32 %bf.load.i) { +; CHECK-LABEL: test: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: vldr d16, .LCPI0_0 +; CHECK-NEXT: vmov.i64 q9, #0xffff +; CHECK-NEXT: vdup.32 d17, r0 +; CHECK-NEXT: vneg.s32 d16, d16 +; CHECK-NEXT: vshl.u32 d16, d17, d16 +; CHECK-NEXT: vldr d17, .LCPI0_1 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vmovl.u32 q8, d16 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bl use +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 24 @ 0x18 +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .long 4095 @ 0xfff +; CHECK-NEXT: .long 1 @ 0x1 +entry: + %0 = insertelement <2 x i32> poison, i32 %bf.load.i, i64 0 + %1 = shufflevector <2 x i32> %0, <2 x i32> poison, <2 x i32> zeroinitializer + %2 = lshr <2 x i32> %1, + %arrayinit.element1.i = getelementptr inbounds i8, ptr poison, i32 16 + %3 = trunc <2 x i32> %2 to <2 x i16> + %4 = and <2 x i16> %3, + %5 = zext nneg <2 x i16> %4 to <2 x i64> + store <2 x i64> %5, ptr %arrayinit.element1.i, align 8 + call void @use() + unreachable +} +declare void @use() -- GitLab From f0b3b6d15b2c0ee2cff2dd31dc075adb5d9a4ff7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 20 Oct 2024 14:23:21 +0100 Subject: [PATCH 185/511] [DAG] isConstantIntBuildVectorOrConstantInt - peek through bitcasts (#112710) (REAPPLIED) Alter both isConstantIntBuildVectorOrConstantInt + 
isConstantFPBuildVectorOrConstantFP to return a bool instead of the underlying SDNode, and adjust usage to account for this. Update isConstantIntBuildVectorOrConstantInt to peek through bitcasts when attempting to find a constant; in particular, this improves canonicalization of constants to the RHS on commutable instructions. X86 is the beneficiary here as it often bitcasts rematerializable 0/-1 vector constants as vXi32 and bitcasts to the requested type. Minor cleanup that helps with #107423. Reapplied after regression fix ba1255def64a9c3c68d97ace051eec76f546eeb0 --- llvm/include/llvm/CodeGen/SelectionDAG.h | 5 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 +-- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 47 +-- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +- llvm/test/CodeGen/X86/avx2-arith.ll | 2 +- llvm/test/CodeGen/X86/combine-sra.ll | 9 +- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 50 ++-- llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 60 ++-- .../CodeGen/X86/min-legal-vector-width.ll | 18 +- llvm/test/CodeGen/X86/pmul.ll | 62 ++-- .../CodeGen/X86/prefer-avx256-wide-mul.ll | 2 +- llvm/test/CodeGen/X86/psubus.ll | 81 +++-- llvm/test/CodeGen/X86/sat-add.ll | 4 +- .../X86/vector-shuffle-combining-sse41.ll | 6 +- llvm/test/CodeGen/X86/vector-trunc-packus.ll | 275 ++++++++--------- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 275 ++++++++--------- llvm/test/CodeGen/X86/vector-trunc-usat.ll | 279 ++++++++---------- 18 files changed, 570 insertions(+), 650 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index b8f80738486a..12ff36c89e33 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -2301,10 +2301,11 @@ public: Align getEVTAlign(EVT MemoryVT) const; /// Test whether the given value is a constant int or similar node. - SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N) const; + bool isConstantIntBuildVectorOrConstantInt(SDValue N, + bool AllowOpaques = true) const; /// Test whether the given value is a constant FP or similar node. - SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) const ; + bool isConstantFPBuildVectorOrConstantFP(SDValue N) const; /// \returns true if \p N is any kind of constant or build_vector of /// constants, int or float. If a vector, it may not necessarily be a splat.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2527bb269643..50a75bc5932c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1205,13 +1205,13 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); - if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) { + if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) { SDNodeFlags NewFlags; if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap()) NewFlags.setNoUnsignedWrap(true); - if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) { + if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1})) return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags); @@ -9932,10 +9932,10 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { // fold (rot* (rot* x, c2), c1) // -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize) + bitsize) % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { - SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); - SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); - if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { - EVT ShiftVT = C1->getValueType(0); + bool C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); + bool C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); + if (C1 && C2 && N1.getValueType() == N0.getOperand(1).getValueType()) { + EVT ShiftVT = N1.getValueType(); bool SameSide = (N->getOpcode() == NextOp); unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); @@ -16807,8 +16807,8 @@ SDValue DAGCombiner::visitVP_FADD(SDNode *N) { SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - SDNode *N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); - SDNode *N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); + bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); + bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; @@ -16905,10 +16905,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // of rounding steps. 
if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { if (N0.getOpcode() == ISD::FMUL) { - SDNode *CFP00 = - DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); - SDNode *CFP01 = - DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); + bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); + bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); // (fadd (fmul x, c), x) -> (fmul x, c+1) if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { @@ -16928,10 +16926,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } if (N1.getOpcode() == ISD::FMUL) { - SDNode *CFP10 = - DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); - SDNode *CFP11 = - DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); + bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); + bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); // (fadd x, (fmul x, c)) -> (fmul x, c+1) if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { @@ -16951,8 +16947,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } if (N0.getOpcode() == ISD::FADD) { - SDNode *CFP00 = - DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); + bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); // (fadd (fadd x, x), x) -> (fmul x, 3.0) if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) && (N0.getOperand(0) == N1)) { @@ -16962,8 +16957,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } if (N1.getOpcode() == ISD::FADD) { - SDNode *CFP10 = - DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); + bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); // (fadd x, (fadd x, x)) -> (fmul x, 3.0) if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && N1.getOperand(0) == N0) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 4b6477957063..55cebc28e492 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7000,10 +7000,10 @@ void SelectionDAG::canonicalizeCommutativeBinop(unsigned Opcode, SDValue &N1, // Canonicalize: // binop(const, nonconst) -> binop(nonconst, const) - SDNode *N1C = isConstantIntBuildVectorOrConstantInt(N1); - SDNode *N2C = isConstantIntBuildVectorOrConstantInt(N2); - SDNode *N1CFP = isConstantFPBuildVectorOrConstantFP(N1); - SDNode *N2CFP = isConstantFPBuildVectorOrConstantFP(N2); + bool N1C = isConstantIntBuildVectorOrConstantInt(N1); + bool N2C = isConstantIntBuildVectorOrConstantInt(N2); + bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); + bool N2CFP = isConstantFPBuildVectorOrConstantFP(N2); if ((N1C && !N2C) || (N1CFP && !N2CFP)) std::swap(N1, N2); @@ -13200,39 +13200,44 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) { return true; } -// Returns the SDNode if it is a constant integer BuildVector -// or constant integer. -SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) const { - if (isa<ConstantSDNode>(N)) - return N.getNode(); +// Returns true if it is a constant integer BuildVector or constant integer, +// possibly hidden by a bitcast. +bool SelectionDAG::isConstantIntBuildVectorOrConstantInt( + SDValue N, bool AllowOpaques) const { + N = peekThroughBitcasts(N); + + if (auto *C = dyn_cast<ConstantSDNode>(N)) + return AllowOpaques || !C->isOpaque(); + if (ISD::isBuildVectorOfConstantSDNodes(N.getNode())) - return N.getNode(); + return true; + // Treat a GlobalAddress supporting constant offset folding as a // constant integer.
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N)) + if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) if (GA->getOpcode() == ISD::GlobalAddress && TLI->isOffsetFoldingLegal(GA)) - return GA; + return true; + if ((N.getOpcode() == ISD::SPLAT_VECTOR) && isa<ConstantSDNode>(N.getOperand(0))) - return N.getNode(); - return nullptr; + return true; + return false; } -// Returns the SDNode if it is a constant float BuildVector -// or constant float. -SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const { +// Returns true if it is a constant float BuildVector or constant float. +bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const { if (isa<ConstantFPSDNode>(N)) - return N.getNode(); + return true; if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) - return N.getNode(); + return true; if ((N.getOpcode() == ISD::SPLAT_VECTOR) && isa<ConstantFPSDNode>(N.getOperand(0))) - return N.getNode(); + return true; - return nullptr; + return false; } std::optional<bool> SelectionDAG::isBoolConstant(SDValue N, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d5466e0a1cbd..7448416c682a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -20760,7 +20760,7 @@ static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) { if (!Add.hasOneUse()) return SDValue(); - if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X))) + if (DAG.isConstantIntBuildVectorOrConstantInt(X)) return SDValue(); SDValue M1 = Add.getOperand(0); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 08321024fb65..bcb84add65d8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -56546,14 +56546,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); SDLoc DL(N); - // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt. auto IsNonOpaqueConstant = [&](SDValue Op) { - if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) { - if (auto *Cst = dyn_cast<ConstantSDNode>(C)) - return !Cst->isOpaque(); - return true; - } - return false; + return DAG.isConstantIntBuildVectorOrConstantInt(Op, + /*AllowOpaques*/ false); };
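As a caller-side sketch of the API change above (illustrative only, not part of the patch; it assumes the SelectionDAG.h declaration, whose hunk is not shown here, defaults AllowOpaques to true, and Op is a hypothetical operand):

  // Old pattern: the helper returned an SDNode*, so callers peeked through
  // bitcasts at each call site (see the removed peekThroughBitcasts calls in
  // DAGCombiner and AArch64ISelLowering) and re-checked the result for
  // opaque constants (see the removed X86 IsNonOpaqueConstant lambda).
  bool WasNonOpaqueConst = false;
  if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(
          peekThroughBitcasts(Op))) {
    auto *Cst = dyn_cast<ConstantSDNode>(C);
    // A build vector or foldable global address comes back as a non-constant
    // SDNode; only a plain ConstantSDNode can be opaque.
    WasNonOpaqueConst = !Cst || !Cst->isOpaque();
  }

  // New pattern: bitcast peeking and opaque-constant filtering live inside
  // the helper, so the same query collapses to a single boolean call.
  bool IsNonOpaqueConst =
      DAG.isConstantIntBuildVectorOrConstantInt(Op, /*AllowOpaques*/ false);
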
See if we can push the diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index 90733dfb8465..44ab33ad67f2 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -122,7 +122,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; CHECK-LABEL: mul_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm3 +; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3 ; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3 ; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index 7eee418742dd..c982884314f6 100644 --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -725,12 +725,11 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519] -; SSE41-NEXT: movdqa %xmm8, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index 6fd3db3464de..ee83a79b6dd5 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2369,8 +2369,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -2391,7 +2391,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -2432,7 +2432,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2450,7 +2450,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2592,8 +2592,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm3, %xmm5 ; SSE41-NEXT: pand %xmm2, %xmm5 @@ -2616,7 +2616,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -2659,7 +2659,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2677,7 +2677,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2823,8 +2823,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: 
pand %xmm3, %xmm5 @@ -2846,7 +2846,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -2889,7 +2889,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -2908,7 +2908,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -3054,8 +3054,8 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -3077,7 +3077,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -3120,7 +3120,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3139,7 +3139,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: 
vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3287,8 +3287,8 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -3311,7 +3311,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -3356,7 +3356,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3376,7 +3376,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 5a1c4c8a52c8..b4e8f0a23016 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1914,7 +1914,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -1922,7 +1922,7 @@ define <32 x i8> 
@vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -1944,7 +1944,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1974,14 +1974,14 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -1999,7 +1999,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2088,7 +2088,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6 @@ -2096,7 +2096,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2120,7 +2120,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: 
vpand %ymm2, %ymm3, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -2150,14 +2150,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -2176,7 +2176,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -2266,7 +2266,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2274,7 +2274,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2297,7 +2297,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2328,14 +2328,14 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: 
vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -2354,7 +2354,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2444,7 +2444,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2452,7 +2452,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2475,7 +2475,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2506,14 +2506,14 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -2532,7 +2532,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: 
vpandn %ymm2, %ymm3, %ymm2 @@ -2623,7 +2623,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2631,7 +2631,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2655,7 +2655,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2687,14 +2687,14 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -2714,7 +2714,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 8289e885618f..9b08d8baacee 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -892,13 +892,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3 +; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, 
%ymm1, %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 ; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2 +; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) @@ -913,13 +913,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpand %ymm3, %ymm4, %ymm5 +; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) -; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3 +; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 @@ -939,13 +939,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3 +; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 ; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; CHECK-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2 +; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 ; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) @@ -967,7 +967,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-SKX-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 @@ -980,7 +980,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 @@ -997,7 +997,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] ; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 6c3d04863118..fe8a4fa16312 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -161,8 +161,8 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; SSE41-LABEL: mul_v16i8: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm3, %xmm4 ; SSE41-NEXT: pand %xmm2, %xmm4 @@ -586,17 +586,16 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: pandn %xmm2, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm5, %xmm6 +; SSE41-NEXT: pmaddubsw %xmm2, %xmm6 ; SSE41-NEXT: pand %xmm4, %xmm6 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm5, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm2, %xmm5 ; SSE41-NEXT: pand %xmm4, %xmm5 @@ -609,7 +608,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX2-LABEL: mul_v32i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3 ; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 @@ -621,7 +620,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX512F-LABEL: mul_v32i8: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; 
AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 ; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 @@ -902,37 +901,34 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pand %xmm4, %xmm9 +; SSE41-NEXT: pandn %xmm4, %xmm9 +; SSE41-NEXT: pand %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm10 -; SSE41-NEXT: pmaddubsw %xmm9, %xmm10 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm10 ; SSE41-NEXT: pand %xmm8, %xmm10 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pandn %xmm4, %xmm9 ; SSE41-NEXT: pmaddubsw %xmm9, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pandn %xmm5, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm9 +; SSE41-NEXT: pmaddubsw %xmm5, %xmm9 ; SSE41-NEXT: pand %xmm8, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pandn %xmm5, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: por %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pandn %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm6 ; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm6, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pandn %xmm6, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm2 ; SSE41-NEXT: psllw $8, %xmm2 ; SSE41-NEXT: por %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pand %xmm7, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm5 @@ -945,14 +941,14 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX2-LABEL: mul_v64i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 ; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 ; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 @@ -963,28 +959,28 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; AVX512F-LABEL: mul_v64i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6 ; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: 
vpandn %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm1 +; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm3) +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8: ; AVX512BW: # %bb.0: # %entry ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index c9bb3de92dcd..885b07585e68 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -59,7 +59,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256BW-LABEL: test_mul_32i8: ; AVX256BW: # %bb.0: ; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX256BW-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1 ; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index be8adf697d5c..9656822d144e 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1671,12 +1671,11 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pxor %xmm9, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 @@ -1684,22 +1683,20 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] ; SSE41-NEXT: movapd %xmm8, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: movapd %xmm8, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; SSE41-NEXT: packusdw %xmm10, %xmm4 -; 
SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm9, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm3, %xmm0 @@ -2771,12 +2768,11 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm9 +; SSE41-NEXT: pxor %xmm10, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 @@ -2784,11 +2780,10 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] ; SSE41-NEXT: movapd %xmm9, %xmm11 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm10, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 ; SSE41-NEXT: pand %xmm3, %xmm0 @@ -2797,11 +2792,10 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] ; SSE41-NEXT: pmaxud %xmm3, %xmm7 ; SSE41-NEXT: psubd %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm2 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -2997,12 +2991,11 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm0, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm9 +; SSE41-NEXT: pxor %xmm10, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 @@ -3010,11 +3003,10 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] ; SSE41-NEXT: movapd %xmm9, 
%xmm11 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm10, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 ; SSE41-NEXT: pand %xmm3, %xmm0 @@ -3023,11 +3015,10 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] ; SSE41-NEXT: pmaxud %xmm3, %xmm6 ; SSE41-NEXT: psubd %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm2 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index 949902a5ebc4..b12be7cb129d 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -631,8 +631,8 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) { ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372034707292117,9223372034707292117] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index efe34c52b371..d3e4906450e4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -84,8 +84,8 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-NEXT: pshufb %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pmaddubsw %xmm3, %xmm4 ; SSE-NEXT: pand %xmm2, %xmm4 @@ -120,7 +120,7 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index 5568604ac29a..0af5e9aeccd9 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -57,8 +57,8 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm4, %xmm5 
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -175,8 +175,8 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -317,12 +317,12 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -330,8 +330,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -584,35 +584,32 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm3, %xmm8 ; SSE41-NEXT: movdqa %xmm6, 
%xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 @@ -620,8 +617,8 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 @@ -828,8 +825,8 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -971,8 +968,8 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -1143,12 +1140,12 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -1156,8 +1153,8 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -1333,12 +1330,12 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; 
SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -1346,8 +1343,8 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -1583,35 +1580,32 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm2, %xmm8 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 @@ -1619,8 +1613,8 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 @@ -2239,8 +2233,8 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -2393,8 +2387,8 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: 
pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -2539,12 +2533,12 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -2552,8 +2546,8 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -2733,12 +2727,12 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -2746,8 +2740,8 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -2987,35 +2981,32 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; 
SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm2, %xmm8 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 @@ -3023,8 +3014,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 @@ -3277,35 +3268,32 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm1, %xmm8 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 @@ -3313,8 +3301,8 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; 
SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 @@ -3677,79 +3665,72 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm9, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm10, %xmm6 +; SSE41-NEXT: pxor %xmm2, %xmm6 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm12, %xmm10 +; SSE41-NEXT: pxor %xmm2, %xmm10 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm10, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE41-NEXT: movdqa %xmm11, %xmm12 +; SSE41-NEXT: pxor %xmm2, %xmm12 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, 
%xmm13 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm12 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm12, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm12 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: movdqa %xmm8, %xmm11 +; SSE41-NEXT: pxor %xmm2, %xmm11 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 @@ -3757,8 +3738,8 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index d276a6873012..3c03c521c272 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -59,8 +59,8 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -182,8 +182,8 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -334,12 +334,12 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -347,8 +347,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: 
pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -604,35 +604,32 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [2147483647,2147483647] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm3, %xmm8 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 @@ -640,8 +637,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 @@ -849,8 +846,8 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -983,8 +980,8 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -1149,12 +1146,12 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: movdqa 
%xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -1162,8 +1159,8 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -1333,12 +1330,12 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -1346,8 +1343,8 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -1579,35 +1576,32 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [32767,32767] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; 
SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm2, %xmm8 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 @@ -1615,8 +1609,8 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 @@ -2002,8 +1996,8 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -2148,8 +2142,8 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -2288,12 +2282,12 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -2301,8 +2295,8 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -2476,12 +2470,12 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: 
pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 @@ -2489,8 +2483,8 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -2726,35 +2720,32 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm2, %xmm8 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 @@ -2762,8 +2753,8 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 @@ -3022,35 +3013,32 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm2 ; 
SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm1, %xmm8 ; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 @@ -3058,8 +3046,8 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 @@ -3430,79 +3418,72 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm9, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm5 ; 
SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm10, %xmm6 +; SSE41-NEXT: pxor %xmm2, %xmm6 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm12, %xmm10 +; SSE41-NEXT: pxor %xmm2, %xmm10 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm10, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE41-NEXT: movdqa %xmm11, %xmm12 +; SSE41-NEXT: pxor %xmm2, %xmm12 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm12 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm12, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm12 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: movdqa %xmm8, %xmm11 +; SSE41-NEXT: pxor %xmm2, %xmm11 ; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm13, %xmm0 @@ -3510,8 +3491,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 412661693747..c1d22dc7daf2 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -207,20 +207,20 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm6 -; 
SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483647,2147483647,2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm3 ; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729] ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 @@ -407,34 +407,31 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 @@ -790,26 +787,25 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = 
[9223372039002324991,9223372039002324991] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm7, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -924,26 +920,25 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pand %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm7, %xmm4 +; SSE41-NEXT: packusdw %xmm6, %xmm4 ; SSE41-NEXT: packusdw %xmm4, %xmm4 ; SSE41-NEXT: movq %xmm4, (%rdi) ; SSE41-NEXT: retq @@ -1094,34 +1089,31 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm7 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, 
%xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 @@ -1869,26 +1861,25 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm7, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -2005,26 +1996,25 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: 
pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pand %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm7, %xmm4 +; SSE41-NEXT: packusdw %xmm6, %xmm4 ; SSE41-NEXT: packusdw %xmm4, %xmm4 ; SSE41-NEXT: packuswb %xmm4, %xmm4 ; SSE41-NEXT: movd %xmm4, (%rdi) @@ -2175,34 +2165,31 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm7 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 @@ -2360,34 +2347,31 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm6 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm9 +; SSE41-NEXT: pxor %xmm5, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, 
%xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 ; SSE41-NEXT: packusdw %xmm9, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 @@ -2602,44 +2586,40 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm11 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm13 +; SSE41-NEXT: pxor %xmm7, %xmm13 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm13 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 ; SSE41-NEXT: pand %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm13 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13 -; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pxor %xmm7, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm2 ; SSE41-NEXT: packusdw %xmm13, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm11, %xmm12 +; SSE41-NEXT: pxor %xmm7, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm12 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 ; SSE41-NEXT: pand %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm12 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm10, %xmm11 +; SSE41-NEXT: pxor %xmm7, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm11 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: 
pcmpgtd %xmm13, %xmm0 ; SSE41-NEXT: pand %xmm11, %xmm0 @@ -2647,32 +2627,29 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 ; SSE41-NEXT: packusdw %xmm12, %xmm11 ; SSE41-NEXT: packusdw %xmm11, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pxor %xmm7, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm10 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE41-NEXT: pand %xmm10, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pxor %xmm7, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 ; SSE41-NEXT: packusdw %xmm10, %xmm9 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pxor %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 -- GitLab From aa7f377c965ca79cf3022ddafe6cbd419bd52db5 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Sun, 20 Oct 2024 10:01:21 -0400 Subject: [PATCH 186/511] [libc++] Remove libc++'s own stdint.h and locale.h (#107436) These headers are not doing anything beyond the system or compiler provided equivalent headers, so there's no real reason to keep them around. Reducing the number of C headers we provide in libc++ simplifies our header layering and reduces the potential for confusion when headers are layered incorrectly. --- libcxx/include/CMakeLists.txt | 2 - libcxx/include/clocale | 8 -- libcxx/include/cstdint | 8 -- libcxx/include/locale.h | 46 ------- libcxx/include/module.modulemap | 8 -- libcxx/include/stdint.h | 127 ------------------ .../depr/depr.c.headers/extern_c.pass.cpp | 3 - libcxx/utils/libcxx/header_information.py | 2 - 8 files changed, 204 deletions(-) delete mode 100644 libcxx/include/locale.h delete mode 100644 libcxx/include/stdint.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 3431ea7dab38..a107314518b1 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -967,7 +967,6 @@ set(files limits list locale - locale.h map math.h mdspan @@ -998,7 +997,6 @@ set(files stdbool.h stddef.h stdexcept - stdint.h stdio.h stdlib.h stop_token diff --git a/libcxx/include/clocale b/libcxx/include/clocale index c689a64be288..4d53aa7eb29b 100644 --- a/libcxx/include/clocale +++ b/libcxx/include/clocale @@ -38,14 +38,6 @@ lconv* localeconv(); #include -#ifndef _LIBCPP_LOCALE_H -# error tried including but didn't find libc++'s header. \ - This usually means that your header search paths are not configured properly. 
\ - The header search paths should contain the C++ Standard Library headers before \ - any C Standard Library, and you are probably using compiler flags that make that \ - not be the case. -#endif - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif diff --git a/libcxx/include/cstdint b/libcxx/include/cstdint index 8c4782859426..9c9b2323d06e 100644 --- a/libcxx/include/cstdint +++ b/libcxx/include/cstdint @@ -144,14 +144,6 @@ Types: #include -#ifndef _LIBCPP_STDINT_H -# error tried including but didn't find libc++'s header. \ - This usually means that your header search paths are not configured properly. \ - The header search paths should contain the C++ Standard Library headers before \ - any C Standard Library, and you are probably using compiler flags that make that \ - not be the case. -#endif - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif diff --git a/libcxx/include/locale.h b/libcxx/include/locale.h deleted file mode 100644 index 425bf47d437a..000000000000 --- a/libcxx/include/locale.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_LOCALE_H -#define _LIBCPP_LOCALE_H - -/* - locale.h synopsis - -Macros: - - LC_ALL - LC_COLLATE - LC_CTYPE - LC_MONETARY - LC_NUMERIC - LC_TIME - -Types: - - lconv - -Functions: - - setlocale - localeconv - -*/ - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#if __has_include_next() -# include_next -#endif - -#endif // _LIBCPP_LOCALE_H diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 5a0e199394d0..06e93d245290 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -2184,10 +2184,6 @@ module std_inttypes_h [system] { header "inttypes.h" export * } -module std_locale_h [system] { - header "locale.h" - export * -} module std_math_h [system] { header "math.h" export * @@ -2204,10 +2200,6 @@ module std_stddef_h [system] { // 's __need_* macros require textual inclusion. textual header "stddef.h" } -module std_stdint_h [system] { - header "stdint.h" - export * -} module std_stdio_h [system] { // 's __need_* macros require textual inclusion. textual header "stdio.h" diff --git a/libcxx/include/stdint.h b/libcxx/include/stdint.h deleted file mode 100644 index 35e5b8cbdad2..000000000000 --- a/libcxx/include/stdint.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_STDINT_H -// AIX system headers need stdint.h to be re-enterable while _STD_TYPES_T -// is defined until an inclusion of it without _STD_TYPES_T occurs, in which -// case the header guard macro is defined. 
-#if !defined(_AIX) || !defined(_STD_TYPES_T) -# define _LIBCPP_STDINT_H -#endif // _STD_TYPES_T - -/* - stdint.h synopsis - -Macros: - - INT8_MIN - INT16_MIN - INT32_MIN - INT64_MIN - - INT8_MAX - INT16_MAX - INT32_MAX - INT64_MAX - - UINT8_MAX - UINT16_MAX - UINT32_MAX - UINT64_MAX - - INT_LEAST8_MIN - INT_LEAST16_MIN - INT_LEAST32_MIN - INT_LEAST64_MIN - - INT_LEAST8_MAX - INT_LEAST16_MAX - INT_LEAST32_MAX - INT_LEAST64_MAX - - UINT_LEAST8_MAX - UINT_LEAST16_MAX - UINT_LEAST32_MAX - UINT_LEAST64_MAX - - INT_FAST8_MIN - INT_FAST16_MIN - INT_FAST32_MIN - INT_FAST64_MIN - - INT_FAST8_MAX - INT_FAST16_MAX - INT_FAST32_MAX - INT_FAST64_MAX - - UINT_FAST8_MAX - UINT_FAST16_MAX - UINT_FAST32_MAX - UINT_FAST64_MAX - - INTPTR_MIN - INTPTR_MAX - UINTPTR_MAX - - INTMAX_MIN - INTMAX_MAX - - UINTMAX_MAX - - PTRDIFF_MIN - PTRDIFF_MAX - - SIG_ATOMIC_MIN - SIG_ATOMIC_MAX - - SIZE_MAX - - WCHAR_MIN - WCHAR_MAX - - WINT_MIN - WINT_MAX - - INT8_C(value) - INT16_C(value) - INT32_C(value) - INT64_C(value) - - UINT8_C(value) - UINT16_C(value) - UINT32_C(value) - UINT64_C(value) - - INTMAX_C(value) - UINTMAX_C(value) - -*/ - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -/* C99 stdlib (e.g. glibc < 2.18) does not provide macros needed - for C++11 unless __STDC_LIMIT_MACROS and __STDC_CONSTANT_MACROS - are defined -*/ -#if defined(__cplusplus) && !defined(__STDC_LIMIT_MACROS) -# define __STDC_LIMIT_MACROS -#endif -#if defined(__cplusplus) && !defined(__STDC_CONSTANT_MACROS) -# define __STDC_CONSTANT_MACROS -#endif - -#if __has_include_next() -# include_next -#endif - -#endif // _LIBCPP_STDINT_H diff --git a/libcxx/test/libcxx/depr/depr.c.headers/extern_c.pass.cpp b/libcxx/test/libcxx/depr/depr.c.headers/extern_c.pass.cpp index 9fa4021e5c1e..63ca66437971 100644 --- a/libcxx/test/libcxx/depr/depr.c.headers/extern_c.pass.cpp +++ b/libcxx/test/libcxx/depr/depr.c.headers/extern_c.pass.cpp @@ -26,9 +26,6 @@ extern "C" { #include #include #include -#ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include -#endif #include #include #include diff --git a/libcxx/utils/libcxx/header_information.py b/libcxx/utils/libcxx/header_information.py index 6bebf3302ffa..2ed52e8c1dbf 100644 --- a/libcxx/utils/libcxx/header_information.py +++ b/libcxx/utils/libcxx/header_information.py @@ -15,7 +15,6 @@ header_restrictions = { # headers with #error directives "ios": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "locale.h": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", # transitive includers of the above headers "clocale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", "codecvt": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", @@ -53,7 +52,6 @@ lit_header_restrictions = { "istream": "// UNSUPPORTED: no-localization", "latch": "// UNSUPPORTED: no-threads, c++03, c++11, c++14, c++17", "locale": "// UNSUPPORTED: no-localization", - "locale.h": "// UNSUPPORTED: no-localization", "mutex": "// UNSUPPORTED: no-threads, c++03", "ostream": "// UNSUPPORTED: no-localization", "print": "// UNSUPPORTED: no-filesystem, c++03, c++11, c++14, c++17, c++20, availability-fp_to_chars-missing", # TODO PRINT investigate -- GitLab From 5a47d48034dd3473eafd621b6f81647dd449f8e3 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sun, 20 Oct 2024 14:01:29 +0000 Subject: [PATCH 187/511] [gn build] Port aa7f377c965c --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 
e5628a1d8604..3b452939839b 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1039,7 +1039,6 @@ if (current_toolchain == default_toolchain) { "limits", "list", "locale", - "locale.h", "map", "math.h", "mdspan", @@ -1070,7 +1069,6 @@ if (current_toolchain == default_toolchain) { "stdbool.h", "stddef.h", "stdexcept", - "stdint.h", "stdio.h", "stdlib.h", "stop_token", -- GitLab From 490b7d12f6bef2c399fca83e6a6dde31be021913 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Sun, 20 Oct 2024 17:44:48 +0200 Subject: [PATCH 188/511] [clang][NFC] Pass const ASTContext& to CXXTypeidExpr API (#113083) --- clang/include/clang/AST/ExprCXX.h | 4 ++-- clang/lib/AST/ExprCXX.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 975bcdac5069..cfe3938f8384 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -876,13 +876,13 @@ public: /// Best-effort check if the expression operand refers to a most derived /// object. This is not a strong guarantee. - bool isMostDerived(ASTContext &Context) const; + bool isMostDerived(const ASTContext &Context) const; bool isTypeOperand() const { return Operand.is(); } /// Retrieves the type operand of this typeid() expression after /// various required adjustments (removing reference types, cv-qualifiers). - QualType getTypeOperand(ASTContext &Context) const; + QualType getTypeOperand(const ASTContext &Context) const; /// Retrieve source information for the type operand. TypeSourceInfo *getTypeOperandSourceInfo() const { diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 83ce404add5f..a2c0c60d43dd 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -147,7 +147,7 @@ bool CXXTypeidExpr::isPotentiallyEvaluated() const { return false; } -bool CXXTypeidExpr::isMostDerived(ASTContext &Context) const { +bool CXXTypeidExpr::isMostDerived(const ASTContext &Context) const { assert(!isTypeOperand() && "Cannot call isMostDerived for typeid(type)"); const Expr *E = getExprOperand()->IgnoreParenNoopCasts(Context); if (const auto *DRE = dyn_cast(E)) { @@ -159,7 +159,7 @@ bool CXXTypeidExpr::isMostDerived(ASTContext &Context) const { return false; } -QualType CXXTypeidExpr::getTypeOperand(ASTContext &Context) const { +QualType CXXTypeidExpr::getTypeOperand(const ASTContext &Context) const { assert(isTypeOperand() && "Cannot call getTypeOperand for typeid(expr)"); Qualifiers Quals; return Context.getUnqualifiedArrayType( -- GitLab From 2ce10f0491142863d3f21cd0adb312ab2cfed107 Mon Sep 17 00:00:00 2001 From: Job Henandez Lara Date: Sun, 20 Oct 2024 09:05:41 -0700 Subject: [PATCH 189/511] [libc] Remove the header in libc/src and libc/test (#113076) --- libc/src/string/CMakeLists.txt | 10 +++++++--- libc/src/string/strcat.h | 2 +- libc/src/string/strcpy.h | 2 +- libc/src/string/strdup.h | 2 +- libc/src/string/strlcat.h | 2 +- libc/src/string/strlcpy.h | 2 +- libc/src/string/strlen.h | 2 +- libc/src/string/strncat.h | 2 +- libc/src/string/strndup.h | 2 +- 9 files changed, 15 insertions(+), 11 deletions(-) diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 787188ab3beb..b33cbc5358d6 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -138,6 +138,7 @@ add_entrypoint_object( DEPENDS .strcpy .string_utils + libc.include.llvm-libc-types.size_t ) add_entrypoint_object( @@ -240,6 +241,7 @@ 
add_entrypoint_object( .string_utils libc.include.stdlib libc.src.errno.errno + libc.include.llvm-libc-types.size_t ) add_entrypoint_object( @@ -270,7 +272,7 @@ add_entrypoint_object( strlcat.h DEPENDS .string_utils - libc.include.string + libc.include.llvm-libc-types.size_t ) add_entrypoint_object( @@ -281,7 +283,7 @@ add_entrypoint_object( strlcpy.h DEPENDS .string_utils - libc.include.string + libc.include.llvm-libc-types.size_t ) add_entrypoint_object( @@ -292,7 +294,7 @@ add_entrypoint_object( strlen.h DEPENDS .string_utils - libc.include.string + libc.include.llvm-libc-types.size_t ) add_entrypoint_object( @@ -304,6 +306,7 @@ add_entrypoint_object( DEPENDS .strncpy .string_utils + libc.include.llvm-libc-types.size_t ) add_entrypoint_object( @@ -346,6 +349,7 @@ add_entrypoint_object( .string_utils libc.include.stdlib libc.src.__support.CPP.new + libc.include.llvm-libc-types.size_t ) add_entrypoint_object( diff --git a/libc/src/string/strcat.h b/libc/src/string/strcat.h index 90a7fd2e4133..82860196ce29 100644 --- a/libc/src/string/strcat.h +++ b/libc/src/string/strcat.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STRING_STRCAT_H #define LLVM_LIBC_SRC_STRING_STRCAT_H +#include "include/llvm-libc-types/size_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strcpy.h b/libc/src/string/strcpy.h index d4f3e81fdc73..9e0c3dbc39ef 100644 --- a/libc/src/string/strcpy.h +++ b/libc/src/string/strcpy.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STRING_STRCPY_H #define LLVM_LIBC_SRC_STRING_STRCPY_H +#include "include/llvm-libc-types/size_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strdup.h b/libc/src/string/strdup.h index 45303a3efeb4..2744e53d45d4 100644 --- a/libc/src/string/strdup.h +++ b/libc/src/string/strdup.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STRING_STRDUP_H #define LLVM_LIBC_SRC_STRING_STRDUP_H +#include "include/llvm-libc-types/size_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strlcat.h b/libc/src/string/strlcat.h index ffe97af62a54..9dc8f3a3bc0d 100644 --- a/libc/src/string/strlcat.h +++ b/libc/src/string/strlcat.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STRING_STRLCAT_H #define LLVM_LIBC_SRC_STRING_STRLCAT_H +#include "include/llvm-libc-types/size_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strlcpy.h b/libc/src/string/strlcpy.h index 058e7653b1b9..45b2c2a2ec26 100644 --- a/libc/src/string/strlcpy.h +++ b/libc/src/string/strlcpy.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STRING_STRLCPY_H #define LLVM_LIBC_SRC_STRING_STRLCPY_H +#include "include/llvm-libc-types/size_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strlen.h b/libc/src/string/strlen.h index f07bf73ace3d..093edcf479bc 100644 --- a/libc/src/string/strlen.h +++ b/libc/src/string/strlen.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STRING_STRLEN_H #define LLVM_LIBC_SRC_STRING_STRLEN_H +#include "include/llvm-libc-types/size_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strncat.h b/libc/src/string/strncat.h index 1a130799f396..f37d9a7bc154 100644 --- a/libc/src/string/strncat.h +++ b/libc/src/string/strncat.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STRING_STRNCAT_H #define LLVM_LIBC_SRC_STRING_STRNCAT_H +#include 
"include/llvm-libc-types/size_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strndup.h b/libc/src/string/strndup.h index 03370cc8d7dc..78cde7b33e13 100644 --- a/libc/src/string/strndup.h +++ b/libc/src/string/strndup.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_STRING_STRNDUP_H #define LLVM_LIBC_SRC_STRING_STRNDUP_H +#include "include/llvm-libc-types/size_t.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { -- GitLab From f13d3f72118b83e326169592e8f3c5962fd0eb29 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 20 Oct 2024 09:05:55 -0700 Subject: [PATCH 190/511] [Tooling] Simplify code with StringMap::operator[] (NFC) (#113071) --- clang/lib/Tooling/Inclusions/HeaderIncludes.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp b/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp index 0cb96097415e..01b3be700b9f 100644 --- a/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp +++ b/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp @@ -335,10 +335,9 @@ HeaderIncludes::HeaderIncludes(StringRef FileName, StringRef Code, // \p Offset: the start of the line following this include directive. void HeaderIncludes::addExistingInclude(Include IncludeToAdd, unsigned NextLineOffset) { - auto Iter = - ExistingIncludes.try_emplace(trimInclude(IncludeToAdd.Name)).first; - Iter->second.push_back(std::move(IncludeToAdd)); - auto &CurInclude = Iter->second.back(); + auto &Incs = ExistingIncludes[trimInclude(IncludeToAdd.Name)]; + Incs.push_back(std::move(IncludeToAdd)); + auto &CurInclude = Incs.back(); // The header name with quotes or angle brackets. // Only record the offset of current #include if we can insert after it. if (CurInclude.R.getOffset() <= MaxInsertOffset) { -- GitLab From 6ec113d4c35db934ec8fdb3d226d2d8e525a1f84 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 20 Oct 2024 09:06:22 -0700 Subject: [PATCH 191/511] [Local] Avoid repeated map lookups (NFC) (#113072) --- llvm/lib/Transforms/Utils/Local.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 06813bac7c78..65c1669f92b4 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3840,11 +3840,11 @@ static const std::optional & collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, std::map> &BPS, int Depth, bool &FoundRoot) { - auto I = BPS.find(V); - if (I != BPS.end()) + auto [I, Inserted] = BPS.try_emplace(V); + if (!Inserted) return I->second; - auto &Result = BPS[V] = std::nullopt; + auto &Result = I->second; auto BitWidth = V->getType()->getScalarSizeInBits(); // Can't do integer/elements > 128 bits. 
-- GitLab From 8673d0e0673dd1a5e6f7a5df7509c45e33582987 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 20 Oct 2024 09:07:10 -0700 Subject: [PATCH 192/511] [lldb] Avoid repeated map lookups (NFC) (#113073) --- lldb/source/Interpreter/Options.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp index 6a90b2cc9b98..893a3b71604b 100644 --- a/lldb/source/Interpreter/Options.cpp +++ b/lldb/source/Interpreter/Options.cpp @@ -251,12 +251,9 @@ Option *Options::GetLongOptions() { m_getopt_table[i].flag = nullptr; m_getopt_table[i].val = short_opt; - if (option_seen.find(short_opt) == option_seen.end()) { - option_seen[short_opt] = i; - } else if (short_opt) { + auto [pos, inserted] = option_seen.try_emplace(short_opt, i); + if (!inserted && short_opt) { m_getopt_table[i].val = 0; - std::map::const_iterator pos = - option_seen.find(short_opt); StreamString strm; if (defs[i].HasShortOption()) Debugger::ReportError( -- GitLab From 5405ba50de6753e3969f4e6c690f53f2abb29b2f Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Sun, 20 Oct 2024 19:37:41 +0200 Subject: [PATCH 193/511] [clang][bytecode] Check ia32_{pext,pdep} builtins for integer args (#113091) --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index d4a8e6c2035e..c7b9dac5fec8 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1331,6 +1331,10 @@ static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { + if (!Call->getArg(0)->getType()->isIntegerType() || + !Call->getArg(1)->getType()->isIntegerType()) + return false; + PrimType ValT = *S.Ctx.classify(Call->getArg(0)); PrimType MaskT = *S.Ctx.classify(Call->getArg(1)); @@ -1352,6 +1356,10 @@ static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { + if (!Call->getArg(0)->getType()->isIntegerType() || + !Call->getArg(1)->getType()->isIntegerType()) + return false; + PrimType ValT = *S.Ctx.classify(Call->getArg(0)); PrimType MaskT = *S.Ctx.classify(Call->getArg(1)); -- GitLab From b9cb9b3f0d1e891b385eb53f8414b29554fd9234 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 20 Oct 2024 10:41:07 -0700 Subject: [PATCH 194/511] [GVNSink] Avoid repeated hash lookups (NFC) (#113023) --- llvm/lib/Transforms/Scalar/GVNSink.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 3dfa2dd9df27..59dfe33d8003 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -535,14 +535,10 @@ public: uint32_t e = ExpressionNumbering[exp]; if (!e) { hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); }); - auto I = HashNumbering.find(H); - if (I != HashNumbering.end()) { - e = I->second; - } else { - e = nextValueNumber++; - HashNumbering[H] = e; - ExpressionNumbering[exp] = e; - } + auto [I, Inserted] = HashNumbering.try_emplace(H, nextValueNumber); + e = I->second; + if (Inserted) + ExpressionNumbering[exp] = nextValueNumber++; } ValueNumbering[V] = e; return e; -- GitLab From 3bddf85e5274b302915f77cec3e1ac60c9309ebd Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 20 Oct 2024 10:41:42 
-0700 Subject: [PATCH 195/511] [sancov] Avoid repeated map lookups (NFC) (#113026) --- llvm/tools/sancov/sancov.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp index 80f9996ba705..b969fc651e0c 100644 --- a/llvm/tools/sancov/sancov.cpp +++ b/llvm/tools/sancov/sancov.cpp @@ -961,10 +961,9 @@ static FunctionLocs resolveFunctions(const SymbolizedCoverage &Coverage, continue; auto P = std::make_pair(Loc.Line, Loc.Column); - auto I = Result.find(Fn); - if (I == Result.end() || I->second > P) { - Result[Fn] = P; - } + auto [It, Inserted] = Result.try_emplace(Fn, P); + if (!Inserted && It->second > P) + It->second = P; } } return Result; -- GitLab From 2077fb80ffb58cd1060ec6a5475399c6ad297df3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 20 Oct 2024 10:42:28 -0700 Subject: [PATCH 196/511] [mlir] Avoid repeated map lookups (NFC) (#113074) --- mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 7b1b1f383e63..32e1dcbc2cce 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -1160,9 +1160,8 @@ bool mlir::sparse_tensor::isBlockSparsity(AffineMap dimToLvl) { } else if (auto dimOp = dyn_cast<AffineDimExpr>(result)) { auto pos = dimOp.getPosition(); // Expect dim to be unset. - if (coeffientMap.find(pos) != coeffientMap.end()) + if (!coeffientMap.try_emplace(pos, 0).second) return false; - coeffientMap[pos] = 0; } else { return false; } -- GitLab From d1401822e2d2753bed3ac597a42cc0b261de40a4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 20 Oct 2024 10:42:53 -0700 Subject: [PATCH 197/511] [Support] Use a heterogeneous lookup with std::map (NFC) (#113075) --- llvm/lib/Support/VirtualFileSystem.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 928c0b5a24ed..3e79ecf2fc7e 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -750,7 +750,7 @@ public: class InMemoryDirectory : public InMemoryNode { Status Stat; - std::map<std::string, std::unique_ptr<InMemoryNode>> Entries; + std::map<std::string, std::unique_ptr<InMemoryNode>, std::less<>> Entries; public: InMemoryDirectory(Status Stat) @@ -766,7 +766,7 @@ public: UniqueID getUniqueID() const { return Stat.getUniqueID(); } InMemoryNode *getChild(StringRef Name) const { - auto I = Entries.find(Name.str()); + auto I = Entries.find(Name); if (I != Entries.end()) return I->second.get(); return nullptr; -- GitLab From c2717a89b8437d041d532c7b2c535ca4f4b35872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 20 Oct 2024 13:51:50 +0300 Subject: [PATCH 198/511] [compiler-rt] [test] Remove an unintended grep parameter This parameter seems unintentional here; we're trying to grep the input on stdin, from the earlier stage in the pipeline. Since a recent update on GitHub Actions runners, the previous form (grepping a file, while piping in data on stdin) would fail running the test, with the test runner Python script throwing an exception when evaluating it:
Since a recent update on Github Actions runners, the previous form (grepping a file, while piping in data on stdin) would fail running the test, with the test runner Python script throwing an exception when evaluating it: File "D:\a\llvm-mingw\llvm-mingw\llvm-project\llvm\utils\lit\lit\TestRunner.py", line 935, in _executeShCmd out = procs[i].stdout.read() ^^^^^^^^^^^^^^^^^^^^^^ File "C:\hostedtoolcache\windows\Python\3.12.7\x64\Lib\encodings\cp1252.py", line 23, in decode return codecs.charmap_decode(input,self.errors,decoding_table)[0] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ TypeError: a bytes-like object is required, not 'NoneType' --- compiler-rt/test/asan/TestCases/Windows/delay_dbghelp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/asan/TestCases/Windows/delay_dbghelp.cpp b/compiler-rt/test/asan/TestCases/Windows/delay_dbghelp.cpp index 9277fe0b2351..38e99cf68594 100644 --- a/compiler-rt/test/asan/TestCases/Windows/delay_dbghelp.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/delay_dbghelp.cpp @@ -9,7 +9,7 @@ // static build, there won't be any clang_rt DLLs. // RUN: not grep cl""ang_rt %t || \ // RUN: grep cl""ang_rt %t | xargs which | \ -// RUN: xargs llvm-readobj --coff-imports | not grep dbghelp.dll %t +// RUN: xargs llvm-readobj --coff-imports | not grep dbghelp.dll extern "C" int puts(const char *); -- GitLab From 20bda93e438c63fb68a8130b7f88090c558e99b7 Mon Sep 17 00:00:00 2001 From: Fawdlstty Date: Mon, 21 Oct 2024 05:17:15 +0800 Subject: [PATCH 199/511] [TLI] Add basic support for scalbnxx (#112936) This patch adds basic support for `scalbln, scalblnf, scalblnl, scalbn, scalbnf, scalbnl`. Constant folding support will be submitted in a subsequent patch. Related issue: <#112631> --- .../llvm/Analysis/TargetLibraryInfo.def | 30 +++++++++++++++++ llvm/lib/Analysis/TargetLibraryInfo.cpp | 8 +++++ llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 6 ++++ .../Transforms/InferFunctionAttrs/annotate.ll | 18 +++++++++++ .../tools/llvm-tli-checker/ps4-tli-check.yaml | 32 ++++++++++++++++--- .../Analysis/TargetLibraryInfoTest.cpp | 6 ++++ 6 files changed, 96 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index d472cde3d504..f890e2b9ec4c 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -2162,6 +2162,36 @@ TLI_DEFINE_ENUM_INTERNAL(roundl) TLI_DEFINE_STRING_INTERNAL("roundl") TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl) +/// double scalbln(double arg, long exp); +TLI_DEFINE_ENUM_INTERNAL(scalbln) +TLI_DEFINE_STRING_INTERNAL("scalbln") +TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl, Long) + +/// float scalblnf(float arg, long exp); +TLI_DEFINE_ENUM_INTERNAL(scalblnf) +TLI_DEFINE_STRING_INTERNAL("scalblnf") +TLI_DEFINE_SIG_INTERNAL(Flt, Flt, Long) + +/// long double scalblnl(long double arg, long exp); +TLI_DEFINE_ENUM_INTERNAL(scalblnl) +TLI_DEFINE_STRING_INTERNAL("scalblnl") +TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, Long) + +/// double scalbn(double arg, int exp); +TLI_DEFINE_ENUM_INTERNAL(scalbn) +TLI_DEFINE_STRING_INTERNAL("scalbn") +TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl, Int) + +/// float scalbnf(float arg, int exp); +TLI_DEFINE_ENUM_INTERNAL(scalbnf) +TLI_DEFINE_STRING_INTERNAL("scalbnf") +TLI_DEFINE_SIG_INTERNAL(Flt, Flt, Int) + +/// long double scalbnl(long double arg, int exp); +TLI_DEFINE_ENUM_INTERNAL(scalbnl) +TLI_DEFINE_STRING_INTERNAL("scalbnl") +TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, Int) + /// 
int scanf(const char *restrict format, ... ); TLI_DEFINE_ENUM_INTERNAL(scanf) TLI_DEFINE_STRING_INTERNAL("scanf") diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index d9651d2f47c6..0ee83d217a50 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -382,6 +382,12 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_rintf); TLI.setUnavailable(LibFunc_round); TLI.setUnavailable(LibFunc_roundf); + TLI.setUnavailable(LibFunc_scalbln); + TLI.setUnavailable(LibFunc_scalblnf); + TLI.setUnavailable(LibFunc_scalblnl); + TLI.setUnavailable(LibFunc_scalbn); + TLI.setUnavailable(LibFunc_scalbnf); + TLI.setUnavailable(LibFunc_scalbnl); TLI.setUnavailable(LibFunc_trunc); TLI.setUnavailable(LibFunc_truncf); } @@ -404,6 +410,8 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_nearbyintl); TLI.setUnavailable(LibFunc_rintl); TLI.setUnavailable(LibFunc_roundl); + TLI.setUnavailable(LibFunc_scalblnl); + TLI.setUnavailable(LibFunc_scalbnl); TLI.setUnavailable(LibFunc_truncl); // Win32 does not support these functions, but diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index c97a77d12e3e..13323604eb51 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1249,6 +1249,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_round: case LibFunc_roundf: case LibFunc_roundl: + case LibFunc_scalbln: + case LibFunc_scalblnf: + case LibFunc_scalblnl: + case LibFunc_scalbn: + case LibFunc_scalbnf: + case LibFunc_scalbnl: case LibFunc_sin: case LibFunc_sincospif_stret: case LibFunc_sinf: diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll index 8567cc00ed00..3e9b2d94efda 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -876,6 +876,24 @@ declare float @roundf(float) ; CHECK: declare x86_fp80 @roundl(x86_fp80) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] declare x86_fp80 @roundl(x86_fp80) +; CHECK: declare double @scalbln(double, i64) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare double @scalbln(double, i64) + +; CHECK: declare float @scalblnf(float, i64) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare float @scalblnf(float, i64) + +; CHECK: declare x86_fp80 @scalblnl(x86_fp80, i64) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare x86_fp80 @scalblnl(x86_fp80, i64) + +; CHECK: declare double @scalbn(double, i32) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare double @scalbn(double, i32) + +; CHECK: declare float @scalbnf(float, i32) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare float @scalbnf(float, i32) + +; CHECK: declare x86_fp80 @scalbnl(x86_fp80, i32) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare x86_fp80 @scalbnl(x86_fp80, i32) + ; CHECK: declare noundef i32 @scanf(ptr nocapture noundef readonly, ...) [[NOFREE_NOUNWIND]] declare i32 @scanf(ptr, ...) 
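For reference, the runtime semantics behind the six new entries: scalbn(x, n) scales x by FLT_RADIX raised to n (in practice x * 2^n, since LLVM's targets use binary floating point), and the scalbln variants take a long exponent instead of an int. A small sketch of that behavior using standard <cmath>, shown for orientation only; it is not part of the patch, which only teaches TLI to recognize and annotate the calls:

// Behavior of the scalbn/scalbln family recognized above (standard <cmath>).
#include <cassert>
#include <cmath>

int main() {
  assert(std::scalbn(3.0, 4) == 48.0);      // 3 * 2^4, exact in binary FP
  assert(std::scalbn(1.0, -3) == 0.125);    // negative exponent divides
  assert(std::scalbnf(1.5f, 1) == 3.0f);    // float variant
  assert(std::scalbln(2.0, 10L) == 2048.0); // long-exponent variant
  return 0;
}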
diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml index aad5794fd8c2..20e7e15e3efb 100644 --- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml +++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml @@ -34,7 +34,7 @@ # # CHECK: << Total TLI yes SDK no: 18 # CHECK: >> Total TLI no SDK yes: 0 -# CHECK: == Total TLI yes SDK yes: 259 +# CHECK: == Total TLI yes SDK yes: 265 # # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*) # WRONG_DETAIL: >> TLI no SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int) @@ -48,14 +48,14 @@ # WRONG_DETAIL: << TLI yes SDK no : 'fminimum_numl' # WRONG_SUMMARY: << Total TLI yes SDK no: 19{{$}} # WRONG_SUMMARY: >> Total TLI no SDK yes: 1{{$}} -# WRONG_SUMMARY: == Total TLI yes SDK yes: 258 +# WRONG_SUMMARY: == Total TLI yes SDK yes: 264 # ## The -COUNT suffix doesn't care if there are too many matches, so check ## the exact count first; the two directives should add up to that. ## Yes, this means additions to TLI will fail this test, but the argument ## to -COUNT can't be an expression. -# AVAIL: TLI knows 510 symbols, 277 available -# AVAIL-COUNT-277: {{^}} available +# AVAIL: TLI knows 516 symbols, 283 available +# AVAIL-COUNT-283: {{^}} available # AVAIL-NOT: {{^}} available # UNAVAIL-COUNT-233: not available # UNAVAIL-NOT: not available @@ -866,6 +866,30 @@ DynamicSymbols: Type: STT_FUNC Section: .text Binding: STB_GLOBAL + - Name: scalbln + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: scalblnf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: scalblnl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: scalbn + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: scalbnf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: scalbnl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL - Name: scanf Type: STT_FUNC Section: .text diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp index b4856b50bbe5..346940384aff 100644 --- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp +++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp @@ -335,6 +335,12 @@ TEST_F(TargetLibraryInfoTest, ValidProto) { "declare double @roundeven(double)\n" "declare float @roundevenf(float)\n" "declare x86_fp80 @roundevenl(x86_fp80)\n" + "declare double @scalbln(double, i64)\n" + "declare float @scalblnf(float, i64)\n" + "declare x86_fp80 @scalblnl(x86_fp80, i64)\n" + "declare double @scalbn(double, i32)\n" + "declare float @scalbnf(float, i32)\n" + "declare x86_fp80 @scalbnl(x86_fp80, i32)\n" "declare i32 @scanf(i8*, ...)\n" "declare void @setbuf(%struct*, i8*)\n" "declare i32 @setitimer(i32, %struct*, %struct*)\n" -- GitLab From 173907b5d77115623f160978a95159e36e05ee6c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 20 Oct 2024 17:26:15 -0700 Subject: [PATCH 200/511] [LV] Move logic to check if op is invariant to legacy cost model. 
(NFC) This allows the function to be re-used in other places. --- .../Transforms/Vectorize/LoopVectorize.cpp | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0d35bfb921dc..e8653498d32a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1527,6 +1527,10 @@ public: getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind) const; + /// Returns true if \p Op should be considered invariant and if it is + /// trivially hoistable. + bool shouldConsiderInvariant(Value *Op); + private: unsigned NumPredStores = 0; @@ -6382,6 +6386,17 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { } } +bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) { + if (!Legal->isInvariant(Op)) + return false; + // Consider Op invariant, if it or its operands aren't predicated + // instruction in the loop. In that case, it is not trivially hoistable. + return !isa<Instruction>(Op) || !TheLoop->contains(cast<Instruction>(Op)) || + (!isPredicatedInst(cast<Instruction>(Op)) && + all_of(cast<Instruction>(Op)->operands(), + [this](Value *Op) { return shouldConsiderInvariant(Op); })); +} + InstructionCost LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF) { @@ -6621,19 +6636,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue(); } auto Op2Info = TTI.getOperandInfo(Op2); - std::function<bool(Value *)> IsInvariant = - [this, &IsInvariant](Value *Op) -> bool { - if (!Legal->isInvariant(Op)) - return false; - // Consider Op2invariant, if it or its operands aren't predicated - // instruction in the loop. In that case, it is not trivially hoistable. - return !isa<Instruction>(Op) || - !TheLoop->contains(cast<Instruction>(Op)) || - (!isPredicatedInst(cast<Instruction>(Op)) && - all_of(cast<Instruction>(Op)->operands(), - [&IsInvariant](Value *Op) { return IsInvariant(Op); })); - }; - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && IsInvariant(Op2)) + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && + shouldConsiderInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); -- GitLab From d80b9cf713fd1698641c5b265de6b66618991476 Mon Sep 17 00:00:00 2001 From: Thomas Fransham Date: Mon, 21 Oct 2024 06:14:52 +0100 Subject: [PATCH 201/511] [Clang][ASTMatchers] Add visibility macros to variables declared by macros (#110206) This will fix missing symbols for ASTMatchersTests on Windows when building with CLANG_LINK_CLANG and explicit visibility macros are used. This PR depends on macros that will be added in #108276. This is part of the work to enable LLVM_BUILD_LLVM_DYLIB and LLVM/Clang plugins on Windows. --- clang/include/clang/ASTMatchers/ASTMatchersMacros.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/ASTMatchers/ASTMatchersMacros.h b/clang/include/clang/ASTMatchers/ASTMatchersMacros.h index 592a3898a295..f781e0a565eb 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchersMacros.h +++ b/clang/include/clang/ASTMatchers/ASTMatchersMacros.h @@ -49,6 +49,8 @@ #ifndef LLVM_CLANG_ASTMATCHERS_ASTMATCHERSMACROS_H #define LLVM_CLANG_ASTMATCHERS_ASTMATCHERSMACROS_H +#include "clang/Support/Compiler.h" + /// AST_MATCHER_FUNCTION(ReturnType, DefineMatcher) { ... } /// defines a zero parameter function named DefineMatcher() that returns a /// ReturnType object.
@@ -367,7 +369,7 @@ static QualType (T::*value())() const { return &T::FunctionName; } \ }; \ } \ - extern const ::clang::ast_matchers::internal:: \ + CLANG_ABI extern const ::clang::ast_matchers::internal:: \ TypeTraversePolymorphicMatcher< \ QualType, \ ::clang::ast_matchers::internal::TypeMatcher##MatcherName##Getter, \ @@ -407,7 +409,7 @@ static TypeLoc (T::*value())() const { return &T::FunctionName##Loc; } \ }; \ } \ - extern const ::clang::ast_matchers::internal:: \ + CLANG_ABI extern const ::clang::ast_matchers::internal:: \ TypeTraversePolymorphicMatcher< \ TypeLoc, \ ::clang::ast_matchers::internal:: \ -- GitLab From df8b785838a2db01b4d056e603f7317209accefb Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 21 Oct 2024 07:51:49 +0200 Subject: [PATCH 202/511] [clang][bytecode] Narrow pointer in UO_Deref unary operators (#113089) Otherwise we treat this like an array element even though we should treat it as a single object. --- clang/lib/AST/ByteCode/Compiler.cpp | 12 ++++++++++-- clang/lib/AST/ByteCode/Pointer.cpp | 2 +- clang/test/AST/ByteCode/cxx98.cpp | 5 +++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 672fa7fc25d6..3f068aa8c189 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -5738,9 +5738,17 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { // We should already have a pointer when we get here. return this->delegate(SubExpr); case UO_Deref: // *x - if (DiscardResult) + if (DiscardResult) { + // assert(false); return this->discard(SubExpr); - return this->visit(SubExpr); + } + + if (!this->visit(SubExpr)) + return false; + if (classifyPrim(SubExpr) == PT_Ptr) + return this->emitNarrowPtr(E); + return true; + case UO_Not: // ~x if (!T) return this->emitError(E); diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index 75b00dcb2ab2..c9de039c195d 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -635,7 +635,7 @@ std::optional Pointer::toRValue(const Context &Ctx, // Return the composite type. 
APValue Result; - if (!Composite(getType(), *this, Result)) + if (!Composite(ResultType, *this, Result)) return std::nullopt; return Result; } diff --git a/clang/test/AST/ByteCode/cxx98.cpp b/clang/test/AST/ByteCode/cxx98.cpp index 471a58f8e055..20f98d33c31c 100644 --- a/clang/test/AST/ByteCode/cxx98.cpp +++ b/clang/test/AST/ByteCode/cxx98.cpp @@ -54,3 +54,8 @@ _Static_assert(a == 0, ""); // both-error {{static assertion expression is not a struct SelfReference { SelfReference &r; }; extern SelfReference self_reference_1; SelfReference self_reference_2 = {self_reference_1}; + +struct PR65784s{ + int *ptr; +} const PR65784[] = {(int *)""}; +PR65784s PR65784f() { return *PR65784; } -- GitLab From 615a5eb02c91ef78f59461f842873617dd187450 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 21 Oct 2024 08:15:51 +0200 Subject: [PATCH 203/511] [clang][bytecode] Check ai32_bextr builtins for integer args (#113128) --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index c7b9dac5fec8..10e33c14f4b4 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1253,6 +1253,10 @@ static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { + if (!Call->getArg(0)->getType()->isIntegerType() || + !Call->getArg(1)->getType()->isIntegerType()) + return false; + PrimType ValT = *S.Ctx.classify(Call->getArg(0)); PrimType IndexT = *S.Ctx.classify(Call->getArg(1)); APSInt Val = peekToAPSInt(S.Stk, ValT, -- GitLab From 9b49392d6edcdfcc59304350ebd4196be5180d4a Mon Sep 17 00:00:00 2001 From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com> Date: Mon, 21 Oct 2024 13:07:48 +0530 Subject: [PATCH 204/511] [Flang] Handle the source (scopes) for some OpenMP constructs (#109097) Fixes: https://github.com/llvm/llvm-project/issues/82943 Fixes: https://github.com/llvm/llvm-project/issues/82942 Fixes: https://github.com/llvm/llvm-project/issues/85593 --- flang/lib/Parser/openmp-parsers.cpp | 33 ++++++++++--------- flang/lib/Semantics/resolve-names.cpp | 17 ++++++++++ .../test/Semantics/OpenMP/atomic06-empty.f90 | 6 ++++ .../Semantics/OpenMP/declare-simd-empty.f90 | 6 ++++ .../OpenMP/threadprivate08-empty.f90 | 5 +++ 5 files changed, 52 insertions(+), 15 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/atomic06-empty.f90 create mode 100644 flang/test/Semantics/OpenMP/declare-simd-empty.f90 create mode 100644 flang/test/Semantics/OpenMP/threadprivate08-empty.f90 diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 8634c522cf34..52c7529369df 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -657,33 +657,36 @@ TYPE_PARSER(construct(startOmpLine >> "END ATOMIC"_tok)) // OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] READ [MEMORY-ORDER-CLAUSE-LIST] TYPE_PARSER("ATOMIC" >> - construct(Parser{} / maybe(","_tok), - verbatim("READ"_tok), Parser{} / endOmpLine, - statement(assignmentStmt), maybe(Parser{} / endOmpLine))) + sourced(construct( + Parser{} / maybe(","_tok), verbatim("READ"_tok), + Parser{} / endOmpLine, statement(assignmentStmt), + maybe(Parser{} / endOmpLine)))) // OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] CAPTURE [MEMORY-ORDER-CLAUSE-LIST] TYPE_PARSER("ATOMIC" >> - construct(Parser{} / maybe(","_tok), - verbatim("CAPTURE"_tok), Parser{} / endOmpLine, - 
statement(assignmentStmt), statement(assignmentStmt), - Parser{} / endOmpLine)) + sourced(construct( + Parser{} / maybe(","_tok), verbatim("CAPTURE"_tok), + Parser{} / endOmpLine, statement(assignmentStmt), + statement(assignmentStmt), Parser{} / endOmpLine))) // OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] UPDATE [MEMORY-ORDER-CLAUSE-LIST] TYPE_PARSER("ATOMIC" >> - construct(Parser{} / maybe(","_tok), - verbatim("UPDATE"_tok), Parser{} / endOmpLine, - statement(assignmentStmt), maybe(Parser{} / endOmpLine))) + sourced(construct( + Parser{} / maybe(","_tok), verbatim("UPDATE"_tok), + Parser{} / endOmpLine, statement(assignmentStmt), + maybe(Parser{} / endOmpLine)))) // OMP ATOMIC [atomic-clause-list] -TYPE_PARSER(construct(verbatim("ATOMIC"_tok), +TYPE_PARSER(sourced(construct(verbatim("ATOMIC"_tok), Parser{} / endOmpLine, statement(assignmentStmt), - maybe(Parser{} / endOmpLine))) + maybe(Parser{} / endOmpLine)))) // OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] WRITE [MEMORY-ORDER-CLAUSE-LIST] TYPE_PARSER("ATOMIC" >> - construct(Parser{} / maybe(","_tok), - verbatim("WRITE"_tok), Parser{} / endOmpLine, - statement(assignmentStmt), maybe(Parser{} / endOmpLine))) + sourced(construct( + Parser{} / maybe(","_tok), verbatim("WRITE"_tok), + Parser{} / endOmpLine, statement(assignmentStmt), + maybe(Parser{} / endOmpLine)))) // Atomic Construct TYPE_PARSER(construct(Parser{}) || diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2fa5b75e073b..030dbc5ea0f0 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1530,6 +1530,23 @@ public: void Post(const parser::OpenMPDeclarativeAllocate &) { SkipImplicitTyping(false); } + bool Pre(const parser::OpenMPDeclarativeConstruct &x) { + AddOmpSourceRange(x.source); + return true; + } + void Post(const parser::OpenMPDeclarativeConstruct &) { + messageHandler().set_currStmtSource(std::nullopt); + } + bool Pre(const parser::OpenMPAtomicConstruct &x) { + return common::visit(common::visitors{[&](const auto &u) -> bool { + AddOmpSourceRange(u.source); + return true; + }}, + x.u); + } + void Post(const parser::OpenMPAtomicConstruct &) { + messageHandler().set_currStmtSource(std::nullopt); + } }; bool OmpVisitor::NeedsScope(const parser::OpenMPBlockConstruct &x) { diff --git a/flang/test/Semantics/OpenMP/atomic06-empty.f90 b/flang/test/Semantics/OpenMP/atomic06-empty.f90 new file mode 100644 index 000000000000..226e8d1bb91a --- /dev/null +++ b/flang/test/Semantics/OpenMP/atomic06-empty.f90 @@ -0,0 +1,6 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp +! Test the source code starting with omp syntax + +!$omp atomic write +i = 123 +end diff --git a/flang/test/Semantics/OpenMP/declare-simd-empty.f90 b/flang/test/Semantics/OpenMP/declare-simd-empty.f90 new file mode 100644 index 000000000000..b61fb53730a2 --- /dev/null +++ b/flang/test/Semantics/OpenMP/declare-simd-empty.f90 @@ -0,0 +1,6 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp +! Test the source code starting with omp syntax + +!$omp declare simd +integer :: x +end diff --git a/flang/test/Semantics/OpenMP/threadprivate08-empty.f90 b/flang/test/Semantics/OpenMP/threadprivate08-empty.f90 new file mode 100644 index 000000000000..38a855dceb83 --- /dev/null +++ b/flang/test/Semantics/OpenMP/threadprivate08-empty.f90 @@ -0,0 +1,5 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp +! 
Test the source code starting with omp syntax + +!$omp threadprivate(a) +end -- GitLab From 3c5cea650dcef5e5aae8f4090f5b7f410b31fca2 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Mon, 21 Oct 2024 13:21:16 +0530 Subject: [PATCH 205/511] [AMDGPU]: Add implicit-def to the BB prolog (#112872) IMPLICIT_DEF inserted for a wwm-register at the very first block or the predecessor block where it is used for sgpr spilling can appear at a block begin that requires spill-insertion during per-lane VGPR regalloc phase. The presence of the IMPLICIT_DEF currently breaks the BB prolog. Fixes: SWDEV-490717 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 + ...ergence-divergent-i1-used-outside-loop.mir | 6 +-- .../GlobalISel/image-waterfall-loop-O0.ll | 2 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 2 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 18 ++++---- ...nfloop-subrange-spill-inspect-subrange.mir | 27 ++++++----- .../CodeGen/AMDGPU/infloop-subrange-spill.mir | 26 +++++------ .../kernel-vgpr-spill-mubuf-with-voffset.ll | 4 +- llvm/test/CodeGen/AMDGPU/merge-m0.mir | 46 +++++++++---------- 9 files changed, 65 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 76c1ea4e7420..89a2eb4f1894 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8918,6 +8918,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, uint16_t Opcode = MI.getOpcode(); return IsNullOrVectorRegister && (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) || + Opcode == AMDGPU::IMPLICIT_DEF || (!MI.isTerminator() && Opcode != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI))); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir index 5bbe3e488689..465731130279 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir @@ -19,8 +19,8 @@ body: | ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]] - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc @@ -122,8 +122,8 @@ body: | ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1) ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def 
$scc @@ -790,8 +790,8 @@ body: | ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]] - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index c9426106af5d..88fd7dcce35f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -170,8 +170,8 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_readlane_b32 s4, v16, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 75d0b83a024f..fe17ff169cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -1135,11 +1135,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb4 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 ; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-O0-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 639b2ff25dcb..603f457f3e05 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -5370,9 +5370,9 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s4, v18, 25 -; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -6223,8 +6223,8 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_mov_b64 exec, s[0:1] ; NOOPT-NEXT: s_cbranch_execz .LBB17_8 ; NOOPT-NEXT: ; %bb.7: ; %bb1 -; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:68 ; 
4-byte Folded Reload ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -7286,10 +7286,10 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: ; implicit-def: $sgpr2 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v4, 0 ; NOOPT-NEXT: v_readlane_b32 s1, v4, 1 -; NOOPT-NEXT: ; implicit-def: $sgpr2 ; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; NOOPT-NEXT: s_mov_b32 s0, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 @@ -7316,11 +7316,11 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: ;;#ASMEND ; NOOPT-NEXT: s_branch .LBB19_4 ; NOOPT-NEXT: .LBB19_3: ; %bb4 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[8:9] -; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -7345,8 +7345,8 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: s_mov_b64 exec, s[8:9] ; NOOPT-NEXT: s_branch .LBB19_1 ; NOOPT-NEXT: .LBB19_4: ; %bb7 -; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr4 ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -7530,10 +7530,10 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v4, 0 ; NOOPT-NEXT: v_readlane_b32 s1, v4, 1 -; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; NOOPT-NEXT: s_mov_b32 s0, 1 ; NOOPT-NEXT: ; implicit-def: $sgpr1 @@ -7561,11 +7561,11 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: ;;#ASMEND ; NOOPT-NEXT: s_branch .LBB20_4 ; NOOPT-NEXT: .LBB20_3: ; %bb4 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] -; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -7591,8 +7591,8 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] ; NOOPT-NEXT: s_branch .LBB20_1 ; NOOPT-NEXT: .LBB20_4: ; %bb7 -; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; NOOPT-NEXT: s_mov_b32 s10, s1 ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed 
$sgpr0_sgpr1 @@ -9106,9 +9106,9 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v18, 1 -; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 ; NOOPT-NEXT: ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3 ; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5 ; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir index 7864564d2891..6603f2ef7ade 100644 --- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir @@ -27,9 +27,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr5 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[DEF:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[DEF2:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) + ; CHECK-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) ; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5 @@ -37,14 +39,15 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF + ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5 ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.4, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, 
$sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF + ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_1:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: {{ $}} @@ -52,37 +55,33 @@ body: | ; CHECK-NEXT: SI_RETURN ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF + ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr12 = IMPLICIT_DEF - ; CHECK-NEXT: SI_SPILL_S512_SAVE renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[IMAGE_SAMPLE_LZ_V1_V2_2:%[0-9]+]].sub0:vreg_96 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF - ; CHECK-NEXT: dead undef [[IMAGE_SAMPLE_LZ_V1_V2_3:%[0-9]+]].sub0:vreg_128 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], undef renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; CHECK-NEXT: dead undef [[IMAGE_SAMPLE_LZ_V1_V2_3:%[0-9]+]].sub0:vreg_128 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], undef renamable 
$sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, killed renamable $sgpr20_sgpr21_sgpr22_sgpr23, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_4:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: - ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF + ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, undef renamable $sgpr4_sgpr5, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: - ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; 
CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.6 diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir index 1030cdb1b43f..fa95f4c13417 100644 --- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir @@ -23,13 +23,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[DEF:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF ; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[DEF2:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[DEF3:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) ; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5 @@ -37,49 +37,47 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF + ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5 ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.3, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF + ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX8_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s256), addrspace 4) + ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_1:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: SI_RETURN ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF + ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr12 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[IMAGE_SAMPLE_LZ_V1_V2_2:%[0-9]+]].sub0:vreg_96 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4) - ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF - ; CHECK-NEXT: dead undef [[IMAGE_SAMPLE_LZ_V1_V2_3:%[0-9]+]].sub0:vreg_128 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], undef renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; CHECK-NEXT: dead undef [[IMAGE_SAMPLE_LZ_V1_V2_3:%[0-9]+]].sub0:vreg_128 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], undef renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, killed renamable $sgpr20_sgpr21_sgpr22_sgpr23, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, 
$sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_4:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF + ; CHECK-NEXT: liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, undef renamable $sgpr4_sgpr5, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: - ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX8_IMM undef renamable $sgpr4_sgpr5, 32, 0 :: (invariant load (s256), addrspace 4) + ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 
0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.6, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 7b195f8e8622..f9f343268105 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -64,9 +64,9 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %store -; CHECK-NEXT: s_add_i32 s4, s33, 0x100000 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 +; CHECK-NEXT: s_add_i32 s5, s33, 0x100000 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/merge-m0.mir b/llvm/test/CodeGen/AMDGPU/merge-m0.mir index 614ee6762a27..19a2652e18f6 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-m0.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-m0.mir @@ -2,9 +2,9 @@ # GCN-LABEL: name: merge-m0-many-init # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN-NEXT: DS_WRITE_B32 # GCN-NEXT: SI_INIT_M0 65536 @@ -128,9 +128,9 @@ body: | # GCN-LABEL: name: merge-m0-dont-hoist-past-init-with-different-initializer # GCN: bb.0.entry: -# GCN: SI_INIT_M0 65536 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 65536 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: @@ -187,7 +187,7 @@ body: | # GCN: bb.0.entry: # GCN-NOT: SI_INIT_M0 # GCN: S_OR_B64 -# GCN-NEXT: SI_INIT_M0 +# GCN: SI_INIT_M0 # GCN: bb.1: # GCN-NOT: SI_INIT_M0 -1 @@ -259,8 +259,8 @@ body: | ... 
# GCN-LABEL: name: move-m0-different-initializer -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF +# GCN: %1:vgpr_32 = IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN: SI_INIT_M0 65536 # GCN-NEXT: S_NOP --- @@ -299,10 +299,10 @@ body: | # GCN-LABEL: name: m0-in-loop-0 # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: @@ -343,10 +343,10 @@ body: | # GCN-LABEL: name: m0-in-loop-1 # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: @@ -384,10 +384,10 @@ body: | # GCN-LABEL: name: m0-in-loop-2 # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: @@ -429,10 +429,10 @@ body: | # GCN-LABEL: name: m0-in-loop-3 # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: @@ -477,10 +477,10 @@ body: | # GCN-LABEL: name: m0-in-loop-4 # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: @@ -525,10 +525,10 @@ body: | # GCN-LABEL: name: m0-in-loop-5 # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: @@ -574,10 +574,10 @@ body: | # GCN-LABEL: name: m0-in-loop-6 # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: @@ -627,10 +627,10 @@ body: | # GCN-LABEL: name: m0-in-loop-7 # GCN: bb.0.entry: -# GCN: SI_INIT_M0 -1 -# GCN-NEXT: IMPLICIT_DEF +# GCN: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 # GCN: bb.1: -- GitLab From 923b8eea644a4d1fed0f3e20677514cf3f4e0fcc Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 21 Oct 2024 09:52:07 +0200 Subject: [PATCH 206/511] [clang][bytecode] Allow ArrayElemPtr ops on null pointers (#113132) This regresses one of the _Complex test cases a bit, but since the diagnostic output wasn't very good here in the first place, let's ignore it. --- clang/lib/AST/ByteCode/Interp.h | 12 +++++++----- clang/test/AST/ByteCode/complex.cpp | 3 +-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index aafc848a9c53..1469fac5a177 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1944,14 +1944,14 @@ inline bool CastMemberPtrPtr(InterpState &S, CodePtr OpPC) { template bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset, - const Pointer &Ptr) { + const Pointer &Ptr, bool IsPointerArith = false) { // A zero offset does not change the pointer. 
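// Illustrative aside (an editorial addition, hedged; the caller names are taken
// from this same patch): only genuine pointer arithmetic passes
// IsPointerArith=true (AddOffset, SubOffset, and the pointer ++/-- helper), so
// ArrayElemPtr-style indexing now skips the null check. The zero-offset early
// return below also matches the language rules, e.g.:
//   constexpr int *p = (int *)0 + 0; // valid: null plus zero is null
//   constexpr int *q = (int *)0 + 1; // invalid in a C++ constant expression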
if (Offset.isZero()) { S.Stk.push(Ptr); return true; } - if (!CheckNull(S, OpPC, Ptr, CSK_ArrayIndex)) { + if (IsPointerArith && !CheckNull(S, OpPC, Ptr, CSK_ArrayIndex)) { // The CheckNull will have emitted a note already, but we only // abort in C++, since this is fine in C. if (S.getLangOpts().CPlusPlus) @@ -2063,14 +2063,16 @@ bool AddOffset(InterpState &S, CodePtr OpPC) { Pointer Ptr = S.Stk.pop(); if (Ptr.isBlockPointer()) Ptr = Ptr.expand(); - return OffsetHelper(S, OpPC, Offset, Ptr); + return OffsetHelper(S, OpPC, Offset, Ptr, + /*IsPointerArith=*/true); } template ::T> bool SubOffset(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); - return OffsetHelper(S, OpPC, Offset, Ptr); + return OffsetHelper(S, OpPC, Offset, Ptr, + /*IsPointerArith=*/true); } template @@ -2090,7 +2092,7 @@ static inline bool IncDecPtrHelper(InterpState &S, CodePtr OpPC, // Now the current Ptr again and a constant 1. OneT One = OneT::from(1); - if (!OffsetHelper(S, OpPC, One, P)) + if (!OffsetHelper(S, OpPC, One, P, /*IsPointerArith=*/true)) return false; // Store the new value. diff --git a/clang/test/AST/ByteCode/complex.cpp b/clang/test/AST/ByteCode/complex.cpp index dc93c786dac7..ee11c6214b70 100644 --- a/clang/test/AST/ByteCode/complex.cpp +++ b/clang/test/AST/ByteCode/complex.cpp @@ -407,8 +407,7 @@ namespace ComplexConstexpr { // ref-note {{cannot access real component of null}} \ // expected-note {{read of dereferenced null pointer}} constexpr float pi = __imag *p; // both-error {{constant expr}} \ - // ref-note {{cannot access imaginary component of null}} \ - // expected-note {{cannot perform pointer arithmetic on null pointer}} + // ref-note {{cannot access imaginary component of null}} constexpr const _Complex double *q = &test3 + 1; constexpr double qr = __real *q; // ref-error {{constant expr}} \ // ref-note {{cannot access real component of pointer past the end}} -- GitLab From d582442becf1507a243614ee7348ccbb51eade28 Mon Sep 17 00:00:00 2001 From: Ronan Keryell Date: Mon, 21 Oct 2024 00:54:04 -0700 Subject: [PATCH 207/511] [llvm-cxxfilt] Add --quote option to quote demangled function names (#111871) This is useful when looking at LLVM/MLIR assembly produced from C++ sources. For example cir.call @_ZN3aie4tileILi1ELi4EE7programIZ4mainE3$_0EEvOT_(%2, %7) : will be translated to cir.call @"void aie::tile<1, 4>::program(main::$_0&&)"(%2, %7) : which can be parsed as valid MLIR by the right mlir-lsp-server. If a symbol is already quoted, do not quote it more. --------- Co-authored-by: James Henderson --- llvm/docs/CommandGuide/llvm-cxxfilt.rst | 5 +++++ llvm/test/tools/llvm-cxxfilt/quote.test | 26 ++++++++++++++++++++++++ llvm/tools/llvm-cxxfilt/Opts.td | 1 + llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp | 23 ++++++++++++++++----- 4 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 llvm/test/tools/llvm-cxxfilt/quote.test diff --git a/llvm/docs/CommandGuide/llvm-cxxfilt.rst b/llvm/docs/CommandGuide/llvm-cxxfilt.rst index 0933f0b5bed8..6743e361d752 100644 --- a/llvm/docs/CommandGuide/llvm-cxxfilt.rst +++ b/llvm/docs/CommandGuide/llvm-cxxfilt.rst @@ -57,6 +57,11 @@ OPTIONS Do not strip a leading underscore. This is the default for all platforms except Mach-O based hosts. +.. option:: --quote + + Add `"` `"` around demangled function symbols. Do not quote already quoted + symbols. + .. 
option:: --strip-underscore, -_ Strip a single leading underscore, if present, from each input name before diff --git a/llvm/test/tools/llvm-cxxfilt/quote.test b/llvm/test/tools/llvm-cxxfilt/quote.test new file mode 100644 index 000000000000..15ced1f6935b --- /dev/null +++ b/llvm/test/tools/llvm-cxxfilt/quote.test @@ -0,0 +1,26 @@ +// Show that llvm-cxxfilt --quote adds quotes around demangled symbols, unless +// the symbol is already quoted. + +RUN: split-file %s %t + +RUN: llvm-cxxfilt --quote < %t/symbols-in-file.test | FileCheck --match-full-lines --check-prefix=CHECK-FILE %s +CHECK-FILE: "bar()" "bar()" +CHECK-FILE-NEXT: "bar()" "bar()" +CHECK-FILE: log() +CHECK-FILE: "import thunk for std::future" + +// Check it works with CLI symbols too. Since a quoted mangled name is not a +// mangled name, it should be unchanged. +RUN: llvm-cxxfilt --quote _Z3firv '"_Z3barv"' 'saw()' | FileCheck --match-full-lines --check-prefix=CHECK-CLI %s +CHECK-CLI: "fir()" +CHECK-CLI-NEXT: "_Z3barv" +CHECK-CLI-NEXT: saw() + +//--- symbols-in-file.test +_Z3barv "_Z3barv" +"_Z3barv" _Z3barv +// This is not mangled, thus it should not be quoted. +log() +// Check that an "import thunk for" prefix can be quoted along the demangled +// name. +__imp__ZSt6futureIvE diff --git a/llvm/tools/llvm-cxxfilt/Opts.td b/llvm/tools/llvm-cxxfilt/Opts.td index 034cb267aab8..a40bc75354a1 100644 --- a/llvm/tools/llvm-cxxfilt/Opts.td +++ b/llvm/tools/llvm-cxxfilt/Opts.td @@ -15,6 +15,7 @@ multiclass Eq { } def help : FF<"help", "Display this help">; +def quote : FF<"quote", "Quote demangled function names with \" \" if not already quoted">; defm strip_underscore : BB<"strip-underscore", "Strip the leading underscore", "Don't strip the leading underscore">; def types : FF<"types", "Attempt to demangle types as well as function names">; def no_params : FF<"no-params", "Skip function parameters and return types">; diff --git a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp index f90adb6cacb9..41b379e8fd39 100644 --- a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp +++ b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp @@ -54,6 +54,7 @@ public: } // namespace static bool ParseParams; +static bool Quote; static bool StripUnderscore; static bool Types; @@ -64,7 +65,15 @@ static void error(const Twine &Message) { exit(1); } -static std::string demangle(const std::string &Mangled) { +// Quote Undecorated with "" if asked for and not already followed by a '"'. +static std::string optionalQuote(const std::string &Undecorated, + StringRef Delimiters) { + if (Quote && (Delimiters.empty() || Delimiters[0] != '"')) + return '"' + Undecorated + '"'; + return Undecorated; +} + +static std::string demangle(const std::string &Mangled, StringRef Delimiters) { using llvm::itanium_demangle::starts_with; std::string_view DecoratedStr = Mangled; bool CanHaveLeadingDot = true; @@ -76,7 +85,7 @@ static std::string demangle(const std::string &Mangled) { std::string Result; if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot, ParseParams)) - return Result; + return optionalQuote(Result, Delimiters); std::string Prefix; char *Undecorated = nullptr; @@ -89,7 +98,8 @@ static std::string demangle(const std::string &Mangled) { Undecorated = itaniumDemangle(DecoratedStr.substr(6), ParseParams); } - Result = Undecorated ? Prefix + Undecorated : Mangled; + Result = + Undecorated ? 
optionalQuote(Prefix + Undecorated, Delimiters) : Mangled; free(Undecorated); return Result; } @@ -137,9 +147,10 @@ static void demangleLine(llvm::raw_ostream &OS, StringRef Mangled, bool Split) { SmallVector, 16> Words; SplitStringDelims(Mangled, Words, IsLegalItaniumChar); for (const auto &Word : Words) - Result += ::demangle(std::string(Word.first)) + Word.second.str(); + Result += + ::demangle(std::string(Word.first), Word.second) + Word.second.str(); } else - Result = ::demangle(std::string(Mangled)); + Result = ::demangle(std::string(Mangled), ""); OS << Result << '\n'; OS.flush(); } @@ -170,6 +181,8 @@ int llvm_cxxfilt_main(int argc, char **argv, const llvm::ToolContext &) { ParseParams = !Args.hasArg(OPT_no_params); + Quote = Args.hasArg(OPT_quote); + Types = Args.hasArg(OPT_types); std::vector Decorated = Args.getAllArgValues(OPT_INPUT); -- GitLab From ba5676cf91f91bbddfacae06c036cf79af0f2088 Mon Sep 17 00:00:00 2001 From: tangaac Date: Mon, 21 Oct 2024 15:58:35 +0800 Subject: [PATCH 208/511] [LoongArch] Minor refinement to monotonic atomic semantics. (#112681) Don't use "_db" version AM instructions for LoongArch atomic memory operations with monotonic semantics. --- .../Target/LoongArch/LoongArchInstrInfo.td | 75 ++++++++----------- .../ir-instruction/atomicrmw-minmax.ll | 16 ++-- .../LoongArch/ir-instruction/atomicrmw.ll | 44 +++++------ 3 files changed, 63 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 15a8f4e3c075..f97aace363cb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -2075,58 +2075,49 @@ multiclass ternary_atomic_op_failure_ord { defm atomic_cmp_swap_i32 : ternary_atomic_op_failure_ord; defm atomic_cmp_swap_i64 : ternary_atomic_op_failure_ord; +// Atomic operation for word and double word +multiclass binary_atomic_op_wd { + def : Pat<(!cast(op#"_i32_monotonic") GPR:$rj, GPR:$rk), + (!cast(inst#"_W"#signed) GPR:$rk, GPR:$rj)>; + def : Pat<(!cast(op#"_i64_monotonic") GPR:$rj, GPR:$rk), + (!cast(inst#"_D"#signed) GPR:$rk, GPR:$rj)>; + + def : Pat<(!cast(op#"_i32") GPR:$rj, GPR:$rk), + (!cast(inst#"__DB_W"#signed) GPR:$rk, GPR:$rj)>; + def : Pat<(!cast(op#"_i64") GPR:$rj, GPR:$rk), + (!cast(inst#"__DB_D"#signed) GPR:$rk, GPR:$rj)>; +} + let Predicates = [IsLA64] in { -def : AtomicPat; -def : Pat<(atomic_swap_i32 GPR:$addr, GPR:$incr), - (AMSWAP__DB_W GPR:$incr, GPR:$addr)>; -def : Pat<(atomic_swap_i64 GPR:$addr, GPR:$incr), - (AMSWAP__DB_D GPR:$incr, GPR:$addr)>; -def : Pat<(atomic_load_add_i64 GPR:$rj, GPR:$rk), - (AMADD__DB_D GPR:$rk, GPR:$rj)>; -def : AtomicPat; + +defm : binary_atomic_op_wd<"AMSWAP", "atomic_swap">; +defm : binary_atomic_op_wd<"AMADD", "atomic_load_add">; +defm : binary_atomic_op_wd<"AMAND", "atomic_load_and">; +defm : binary_atomic_op_wd<"AMOR", "atomic_load_or">; +defm : binary_atomic_op_wd<"AMXOR", "atomic_load_xor">; +defm : binary_atomic_op_wd<"AMMIN", "atomic_load_umin", "U">; +defm : binary_atomic_op_wd<"AMMAX", "atomic_load_umax", "U">; +defm : binary_atomic_op_wd<"AMMIN", "atomic_load_min">; +defm : binary_atomic_op_wd<"AMMAX", "atomic_load_max">; +def : Pat<(atomic_load_sub_i32_monotonic GPR:$rj, GPR:$rk), + (AMADD_W (SUB_W R0, GPR:$rk), GPR:$rj)>; +def : Pat<(atomic_load_sub_i64_monotonic GPR:$rj, GPR:$rk), + (AMADD_D (SUB_D R0, GPR:$rk), GPR:$rj)>; + def : Pat<(atomic_load_sub_i32 GPR:$rj, GPR:$rk), (AMADD__DB_W (SUB_W R0, GPR:$rk), GPR:$rj)>; def : 
Pat<(atomic_load_sub_i64 GPR:$rj, GPR:$rk), (AMADD__DB_D (SUB_D R0, GPR:$rk), GPR:$rj)>; + +def : AtomicPat; +def : AtomicPat; def : AtomicPat; defm : PseudoBinPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64>; def : AtomicPat; -def : Pat<(atomic_load_add_i32 GPR:$rj, GPR:$rk), - (AMADD__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_and_i32 GPR:$rj, GPR:$rk), - (AMAND__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_and_i64 GPR:$rj, GPR:$rk), - (AMAND__DB_D GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_or_i32 GPR:$rj, GPR:$rk), - (AMOR__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_or_i64 GPR:$rj, GPR:$rk), - (AMOR__DB_D GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_xor_i32 GPR:$rj, GPR:$rk), - (AMXOR__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_xor_i64 GPR:$rj, GPR:$rk), - (AMXOR__DB_D GPR:$rk, GPR:$rj)>; - -def : Pat<(atomic_load_umin_i32 GPR:$rj, GPR:$rk), - (AMMIN__DB_WU GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_umin_i64 GPR:$rj, GPR:$rk), - (AMMIN__DB_DU GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_umax_i32 GPR:$rj, GPR:$rk), - (AMMAX__DB_WU GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_umax_i64 GPR:$rj, GPR:$rk), - (AMMAX__DB_DU GPR:$rk, GPR:$rj)>; - -def : Pat<(atomic_load_min_i32 GPR:$rj, GPR:$rk), - (AMMIN__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_min_i64 GPR:$rj, GPR:$rk), - (AMMIN__DB_D GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_max_i32 GPR:$rj, GPR:$rk), - (AMMAX__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_max_i64 GPR:$rj, GPR:$rk), - (AMMAX__DB_D GPR:$rk, GPR:$rj)>; def : AtomicPat; diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll index 2bd29c2670a6..03386514a72c 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll @@ -1368,7 +1368,7 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind { define i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind { ; LA64-LABEL: atomicrmw_umax_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: ammax_db.wu $a2, $a1, $a0 +; LA64-NEXT: ammax.wu $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw umax ptr %a, i32 %b monotonic @@ -1378,7 +1378,7 @@ define i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind { define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; LA64-LABEL: atomicrmw_umax_i64_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: ammax_db.du $a2, $a1, $a0 +; LA64-NEXT: ammax.du $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw umax ptr %a, i64 %b monotonic @@ -1445,7 +1445,7 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind { define i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind { ; LA64-LABEL: atomicrmw_umin_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: ammin_db.wu $a2, $a1, $a0 +; LA64-NEXT: ammin.wu $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw umin ptr %a, i32 %b monotonic @@ -1455,7 +1455,7 @@ define i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind { define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; LA64-LABEL: atomicrmw_umin_i64_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: ammin_db.du $a2, $a1, $a0 +; LA64-NEXT: ammin.du $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw umin ptr %a, i64 %b monotonic @@ -1531,7 +1531,7 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { define i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) 
nounwind { ; LA64-LABEL: atomicrmw_max_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: ammax_db.w $a2, $a1, $a0 +; LA64-NEXT: ammax.w $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw max ptr %a, i32 %b monotonic @@ -1541,7 +1541,7 @@ define i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind { define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; LA64-LABEL: atomicrmw_max_i64_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: ammax_db.d $a2, $a1, $a0 +; LA64-NEXT: ammax.d $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw max ptr %a, i64 %b monotonic @@ -1617,7 +1617,7 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { define i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind { ; LA64-LABEL: atomicrmw_min_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: ammin_db.w $a2, $a1, $a0 +; LA64-NEXT: ammin.w $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw min ptr %a, i32 %b monotonic @@ -1627,7 +1627,7 @@ define i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind { define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; LA64-LABEL: atomicrmw_min_i64_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: ammin_db.d $a2, $a1, $a0 +; LA64-NEXT: ammin.d $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw min ptr %a, i64 %b monotonic diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll index f2f459ecaa2e..b1af9c17b601 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll @@ -3982,7 +3982,7 @@ define i8 @atomicrmw_xchg_0_i8_monotonic(ptr %a) nounwind { ; LA64-NEXT: ori $a2, $zero, 255 ; LA64-NEXT: sll.w $a2, $a2, $a1 ; LA64-NEXT: nor $a2, $a2, $zero -; LA64-NEXT: amand_db.w $a3, $a2, $a0 +; LA64-NEXT: amand.w $a3, $a2, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a1 ; LA64-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 0 monotonic @@ -4011,7 +4011,7 @@ define i8 @atomicrmw_xchg_minus_1_i8_monotonic(ptr %a) nounwind { ; LA64-NEXT: bstrins.d $a0, $zero, 1, 0 ; LA64-NEXT: ori $a2, $zero, 255 ; LA64-NEXT: sll.w $a2, $a2, $a1 -; LA64-NEXT: amor_db.w $a3, $a2, $a0 +; LA64-NEXT: amor.w $a3, $a2, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a1 ; LA64-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 -1 monotonic @@ -4090,7 +4090,7 @@ define i16 @atomicrmw_xchg_0_i16_monotonic(ptr %a) nounwind { ; LA64-NEXT: ori $a2, $a2, 4095 ; LA64-NEXT: sll.w $a2, $a2, $a1 ; LA64-NEXT: nor $a2, $a2, $zero -; LA64-NEXT: amand_db.w $a3, $a2, $a0 +; LA64-NEXT: amand.w $a3, $a2, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a1 ; LA64-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 0 monotonic @@ -4121,7 +4121,7 @@ define i16 @atomicrmw_xchg_minus_1_i16_monotonic(ptr %a) nounwind { ; LA64-NEXT: lu12i.w $a2, 15 ; LA64-NEXT: ori $a2, $a2, 4095 ; LA64-NEXT: sll.w $a2, $a2, $a1 -; LA64-NEXT: amor_db.w $a3, $a2, $a0 +; LA64-NEXT: amor.w $a3, $a2, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a1 ; LA64-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 -1 monotonic @@ -4142,7 +4142,7 @@ define i32 @atomicrmw_xchg_i32_monotonic(ptr %a, i32 %b) nounwind { ; ; LA64-LABEL: atomicrmw_xchg_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amswap_db.w $a2, $a1, $a0 +; LA64-NEXT: amswap.w $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw xchg ptr %a, i32 %b monotonic @@ -4162,7 +4162,7 @@ define i64 @atomicrmw_xchg_i64_monotonic(ptr %a, i64 %b) nounwind { ; ; LA64-LABEL: atomicrmw_xchg_i64_monotonic: ; LA64: # %bb.0: -; 
LA64-NEXT: amswap_db.d $a2, $a1, $a0 +; LA64-NEXT: amswap.d $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw xchg ptr %a, i64 %b monotonic @@ -4273,7 +4273,7 @@ define i32 @atomicrmw_add_i32_monotonic(ptr %a, i32 %b) nounwind { ; ; LA64-LABEL: atomicrmw_add_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amadd_db.w $a2, $a1, $a0 +; LA64-NEXT: amadd.w $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw add ptr %a, i32 %b monotonic @@ -4293,7 +4293,7 @@ define i64 @atomicrmw_add_i64_monotonic(ptr %a, i64 %b) nounwind { ; ; LA64-LABEL: atomicrmw_add_i64_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amadd_db.d $a2, $a1, $a0 +; LA64-NEXT: amadd.d $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw add ptr %a, i64 %b monotonic @@ -4405,7 +4405,7 @@ define i32 @atomicrmw_sub_i32_monotonic(ptr %a, i32 %b) nounwind { ; LA64-LABEL: atomicrmw_sub_i32_monotonic: ; LA64: # %bb.0: ; LA64-NEXT: sub.w $a2, $zero, $a1 -; LA64-NEXT: amadd_db.w $a1, $a2, $a0 +; LA64-NEXT: amadd.w $a1, $a2, $a0 ; LA64-NEXT: move $a0, $a1 ; LA64-NEXT: ret %1 = atomicrmw sub ptr %a, i32 %b monotonic @@ -4426,7 +4426,7 @@ define i64 @atomicrmw_sub_i64_monotonic(ptr %a, i64 %b) nounwind { ; LA64-LABEL: atomicrmw_sub_i64_monotonic: ; LA64: # %bb.0: ; LA64-NEXT: sub.d $a2, $zero, $a1 -; LA64-NEXT: amadd_db.d $a1, $a2, $a0 +; LA64-NEXT: amadd.d $a1, $a2, $a0 ; LA64-NEXT: move $a0, $a1 ; LA64-NEXT: ret %1 = atomicrmw sub ptr %a, i64 %b monotonic @@ -4609,7 +4609,7 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind { ; LA64-NEXT: andi $a1, $a1, 255 ; LA64-NEXT: sll.w $a1, $a1, $a2 ; LA64-NEXT: orn $a1, $a1, $a3 -; LA64-NEXT: amand_db.w $a3, $a1, $a0 +; LA64-NEXT: amand.w $a3, $a1, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a2 ; LA64-NEXT: ret %1 = atomicrmw and ptr %a, i8 %b monotonic @@ -4646,7 +4646,7 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind { ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ; LA64-NEXT: sll.w $a1, $a1, $a2 ; LA64-NEXT: orn $a1, $a1, $a3 -; LA64-NEXT: amand_db.w $a3, $a1, $a0 +; LA64-NEXT: amand.w $a3, $a1, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a2 ; LA64-NEXT: ret %1 = atomicrmw and ptr %a, i16 %b monotonic @@ -4667,7 +4667,7 @@ define i32 @atomicrmw_and_i32_monotonic(ptr %a, i32 %b) nounwind { ; ; LA64-LABEL: atomicrmw_and_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amand_db.w $a2, $a1, $a0 +; LA64-NEXT: amand.w $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw and ptr %a, i32 %b monotonic @@ -4687,7 +4687,7 @@ define i64 @atomicrmw_and_i64_monotonic(ptr %a, i64 %b) nounwind { ; ; LA64-LABEL: atomicrmw_and_i64_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amand_db.d $a2, $a1, $a0 +; LA64-NEXT: amand.d $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw and ptr %a, i64 %b monotonic @@ -4716,7 +4716,7 @@ define i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind { ; LA64-NEXT: bstrins.d $a0, $zero, 1, 0 ; LA64-NEXT: andi $a1, $a1, 255 ; LA64-NEXT: sll.w $a1, $a1, $a2 -; LA64-NEXT: amor_db.w $a3, $a1, $a0 +; LA64-NEXT: amor.w $a3, $a1, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a2 ; LA64-NEXT: ret %1 = atomicrmw or ptr %a, i8 %b monotonic @@ -4745,7 +4745,7 @@ define i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind { ; LA64-NEXT: bstrins.d $a0, $zero, 1, 0 ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ; LA64-NEXT: sll.w $a1, $a1, $a2 -; LA64-NEXT: amor_db.w $a3, $a1, $a0 +; LA64-NEXT: amor.w $a3, $a1, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a2 ; LA64-NEXT: ret %1 = atomicrmw or ptr %a, i16 %b monotonic @@ 
-4766,7 +4766,7 @@ define i32 @atomicrmw_or_i32_monotonic(ptr %a, i32 %b) nounwind { ; ; LA64-LABEL: atomicrmw_or_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amor_db.w $a2, $a1, $a0 +; LA64-NEXT: amor.w $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw or ptr %a, i32 %b monotonic @@ -4786,7 +4786,7 @@ define i64 @atomicrmw_or_i64_monotonic(ptr %a, i64 %b) nounwind { ; ; LA64-LABEL: atomicrmw_or_i64_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amor_db.d $a2, $a1, $a0 +; LA64-NEXT: amor.d $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw or ptr %a, i64 %b monotonic @@ -4815,7 +4815,7 @@ define i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind { ; LA64-NEXT: bstrins.d $a0, $zero, 1, 0 ; LA64-NEXT: andi $a1, $a1, 255 ; LA64-NEXT: sll.w $a1, $a1, $a2 -; LA64-NEXT: amxor_db.w $a3, $a1, $a0 +; LA64-NEXT: amxor.w $a3, $a1, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a2 ; LA64-NEXT: ret %1 = atomicrmw xor ptr %a, i8 %b monotonic @@ -4844,7 +4844,7 @@ define i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind { ; LA64-NEXT: bstrins.d $a0, $zero, 1, 0 ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ; LA64-NEXT: sll.w $a1, $a1, $a2 -; LA64-NEXT: amxor_db.w $a3, $a1, $a0 +; LA64-NEXT: amxor.w $a3, $a1, $a0 ; LA64-NEXT: srl.w $a0, $a3, $a2 ; LA64-NEXT: ret %1 = atomicrmw xor ptr %a, i16 %b monotonic @@ -4865,7 +4865,7 @@ define i32 @atomicrmw_xor_i32_monotonic(ptr %a, i32 %b) nounwind { ; ; LA64-LABEL: atomicrmw_xor_i32_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amxor_db.w $a2, $a1, $a0 +; LA64-NEXT: amxor.w $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw xor ptr %a, i32 %b monotonic @@ -4885,7 +4885,7 @@ define i64 @atomicrmw_xor_i64_monotonic(ptr %a, i64 %b) nounwind { ; ; LA64-LABEL: atomicrmw_xor_i64_monotonic: ; LA64: # %bb.0: -; LA64-NEXT: amxor_db.d $a2, $a1, $a0 +; LA64-NEXT: amxor.d $a2, $a1, $a0 ; LA64-NEXT: move $a0, $a2 ; LA64-NEXT: ret %1 = atomicrmw xor ptr %a, i64 %b monotonic -- GitLab From c77e836123d056d98051ee980003593706f9284d Mon Sep 17 00:00:00 2001 From: Piyou Chen Date: Mon, 21 Oct 2024 16:10:22 +0800 Subject: [PATCH 209/511] [RISCV][FMV] Remove support for negative priority (#112161) Ensure that target_version and target_clones do not accept negative numbers for the priority feature. Based on discussion on https://github.com/riscv-non-isa/riscv-c-api-doc/pull/85. --- clang/lib/CodeGen/CodeGenFunction.cpp | 9 ++- clang/lib/Sema/SemaDeclAttr.cpp | 4 +- clang/test/CodeGen/attr-target-clones-riscv.c | 60 +----------------- .../test/CodeGen/attr-target-version-riscv.c | 63 +------------------ .../CodeGenCXX/attr-target-clones-riscv.cpp | 60 +----------------- .../CodeGenCXX/attr-target-version-riscv.cpp | 57 ----------------- .../test/SemaCXX/attr-target-clones-riscv.cpp | 3 + .../SemaCXX/attr-target-version-riscv.cpp | 6 ++ 8 files changed, 21 insertions(+), 241 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 2306043c90f4..465dc8c661af 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -2904,19 +2904,18 @@ void CodeGenFunction::EmitMultiVersionResolver( } } -static int getPriorityFromAttrString(StringRef AttrStr) { +static unsigned getPriorityFromAttrString(StringRef AttrStr) { SmallVector Attrs; AttrStr.split(Attrs, ';'); // Default Priority is zero.
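// Illustrative aside (an editorial addition, hedged): parsing the value into
// an unsigned integer makes StringRef::getAsInteger fail on a leading '-', so
// a component such as "priority=-1" keeps the zero default here and is
// rejected with a diagnostic in Sema. A minimal sketch:
//   unsigned P;
//   bool Failed = StringRef("-1").getAsInteger(0, P); // Failed == true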
- int Priority = 0; + unsigned Priority = 0; for (auto Attr : Attrs) { if (Attr.consume_front("priority=")) { - int Result; - if (!Attr.getAsInteger(0, Result)) { + unsigned Result; + if (!Attr.getAsInteger(0, Result)) Priority = Result; - } } } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 6759aae37afa..601c6f2eef1d 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3072,7 +3072,7 @@ bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D, if (HasPriority) DuplicateAttr = true; HasPriority = true; - int Digit; + unsigned Digit; if (AttrStr.getAsInteger(0, Digit)) return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) << Unsupported << None << AttrStr << TargetVersion; @@ -3226,7 +3226,7 @@ bool Sema::checkTargetClonesAttrString( HasDefault = true; } else if (AttrStr.consume_front("priority=")) { IsPriority = true; - int Digit; + unsigned Digit; if (AttrStr.getAsInteger(0, Digit)) return Diag(CurLoc, diag::warn_unsupported_target_attribute) << Unsupported << None << Str << TargetClones; diff --git a/clang/test/CodeGen/attr-target-clones-riscv.c b/clang/test/CodeGen/attr-target-clones-riscv.c index 4a5dea91e227..2e8018c707d9 100644 --- a/clang/test/CodeGen/attr-target-clones-riscv.c +++ b/clang/test/CodeGen/attr-target-clones-riscv.c @@ -16,10 +16,9 @@ __attribute__((target_clones("default", "arch=+zvkt"))) int foo6(void) { return __attribute__((target_clones("default", "arch=+zbb", "arch=+zba", "arch=+zbb,+zba"))) int foo7(void) { return 2; } __attribute__((target_clones("default", "arch=+zbb;priority=2", "arch=+zba;priority=1", "arch=+zbb,+zba;priority=3"))) int foo8(void) { return 2; } __attribute__((target_clones("default", "arch=+zbb;priority=1", "priority=2;arch=+zba", "priority=3;arch=+zbb,+zba"))) int foo9(void) { return 2; } -__attribute__((target_clones("default", "arch=+zbb;priority=-1", "priority=-2;arch=+zba", "priority=3;arch=+zbb,+zba"))) int foo10(void) { return 2; } -int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() + foo8() + foo9() + foo10(); } +int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() + foo8() + foo9(); } //. // CHECK: @__riscv_feature_bits = external dso_local global { i32, [2 x i64] } @@ -32,7 +31,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() // CHECK: @foo7.ifunc = weak_odr alias i32 (), ptr @foo7 // CHECK: @foo8.ifunc = weak_odr alias i32 (), ptr @foo8 // CHECK: @foo9.ifunc = weak_odr alias i32 (), ptr @foo9 -// CHECK: @foo10.ifunc = weak_odr alias i32 (), ptr @foo10 // CHECK: @foo1 = weak_odr ifunc i32 (), ptr @foo1.resolver // CHECK: @foo2 = weak_odr ifunc i32 (), ptr @foo2.resolver // CHECK: @foo3 = weak_odr ifunc i32 (), ptr @foo3.resolver @@ -42,7 +40,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() // CHECK: @foo7 = weak_odr ifunc i32 (), ptr @foo7.resolver // CHECK: @foo8 = weak_odr ifunc i32 (), ptr @foo8.resolver // CHECK: @foo9 = weak_odr ifunc i32 (), ptr @foo9.resolver -// CHECK: @foo10 = weak_odr ifunc i32 (), ptr @foo10.resolver //. 
// CHECK-LABEL: define dso_local signext i32 @foo1.default( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -347,57 +344,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() // CHECK-NEXT: ret ptr @foo9.default // // -// CHECK-LABEL: define dso_local signext i32 @foo10.default( -// CHECK-SAME: ) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK-LABEL: define dso_local signext i32 @foo10._zbb( -// CHECK-SAME: ) #[[ATTR2]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK-LABEL: define dso_local signext i32 @foo10._zba( -// CHECK-SAME: ) #[[ATTR6]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK-LABEL: define dso_local signext i32 @foo10._zba_zbb( -// CHECK-SAME: ) #[[ATTR7]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK-LABEL: define weak_odr ptr @foo10.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_riscv_feature_bits(ptr null) -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 402653184 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 402653184 -// CHECK-NEXT: br i1 [[TMP2]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @foo10._zba_zbb -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], 268435456 -// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 268435456 -// CHECK-NEXT: br i1 [[TMP5]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @foo10._zbb -// CHECK: resolver_else2: -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 134217728 -// CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 134217728 -// CHECK-NEXT: br i1 [[TMP8]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] -// CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @foo10._zba -// CHECK: resolver_else4: -// CHECK-NEXT: ret ptr @foo10.default -// -// // CHECK-LABEL: define dso_local signext i32 @bar( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -418,9 +364,7 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() // CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD11]], [[CALL12]] // CHECK-NEXT: [[CALL14:%.*]] = call signext i32 @foo9() // CHECK-NEXT: [[ADD15:%.*]] = add nsw i32 [[ADD13]], [[CALL14]] -// CHECK-NEXT: [[CALL16:%.*]] = call signext i32 @foo10() -// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[ADD15]], [[CALL16]] -// CHECK-NEXT: ret i32 [[ADD17]] +// CHECK-NEXT: ret i32 [[ADD15]] // //. 
// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+i" } diff --git a/clang/test/CodeGen/attr-target-version-riscv.c b/clang/test/CodeGen/attr-target-version-riscv.c index 7d0e61e61542..362b9ad32365 100644 --- a/clang/test/CodeGen/attr-target-version-riscv.c +++ b/clang/test/CodeGen/attr-target-version-riscv.c @@ -32,12 +32,7 @@ __attribute__((target_version("arch=+zbb;priority=9"))) int foo7(void) { return __attribute__((target_version("arch=+zbb,+zba;priority=10"))) int foo7(void) { return 1; } __attribute__((target_version("default"))) int foo7(void) { return 1; } -__attribute__((target_version("priority=-1;arch=+zba"))) int foo8(void) { return 1; } -__attribute__((target_version("arch=+zbb;priority=-2"))) int foo8(void) { return 1; } -__attribute__((target_version("arch=+zbb,+zba;priority=3"))) int foo8(void) { return 1; } -__attribute__((target_version("default"))) int foo8(void) { return 1; } - -int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() + foo8(); } +int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7(); } //. // CHECK: @__riscv_feature_bits = external dso_local global { i32, [2 x i64] } // CHECK: @foo1 = weak_odr ifunc i32 (), ptr @foo1.resolver @@ -47,7 +42,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() // CHECK: @foo5 = weak_odr ifunc i32 (), ptr @foo5.resolver // CHECK: @foo6 = weak_odr ifunc i32 (), ptr @foo6.resolver // CHECK: @foo7 = weak_odr ifunc i32 (), ptr @foo7.resolver -// CHECK: @foo8 = weak_odr ifunc i32 (), ptr @foo8.resolver //. // CHECK-LABEL: define dso_local signext i32 @foo1._v( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -193,30 +187,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: define dso_local signext i32 @foo8._zba( -// CHECK-SAME: ) #[[ATTR5]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK-LABEL: define dso_local signext i32 @foo8._zbb( -// CHECK-SAME: ) #[[ATTR2]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK-LABEL: define dso_local signext i32 @foo8._zba_zbb( -// CHECK-SAME: ) #[[ATTR6]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK-LABEL: define dso_local signext i32 @foo8.default( -// CHECK-SAME: ) #[[ATTR1]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// // CHECK-LABEL: define dso_local signext i32 @bar( // CHECK-SAME: ) #[[ATTR1]] { // CHECK-NEXT: entry: @@ -233,9 +203,7 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() // CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD7]], [[CALL8]] // CHECK-NEXT: [[CALL10:%.*]] = call signext i32 @foo7() // CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] -// CHECK-NEXT: [[CALL12:%.*]] = call signext i32 @foo8() -// CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD11]], [[CALL12]] -// CHECK-NEXT: ret i32 [[ADD13]] +// CHECK-NEXT: ret i32 [[ADD11]] // // // CHECK-LABEL: define weak_odr ptr @foo1.resolver() comdat { @@ -398,33 +366,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7() // CHECK: resolver_else4: // CHECK-NEXT: ret ptr @foo7.default // -// -// CHECK-LABEL: define weak_odr ptr @foo8.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_riscv_feature_bits(ptr null) -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, 
ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 402653184 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 402653184 -// CHECK-NEXT: br i1 [[TMP2]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @foo8._zba_zbb -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], 134217728 -// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 134217728 -// CHECK-NEXT: br i1 [[TMP5]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @foo8._zba -// CHECK: resolver_else2: -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 268435456 -// CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 268435456 -// CHECK-NEXT: br i1 [[TMP8]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] -// CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @foo8._zbb -// CHECK: resolver_else4: -// CHECK-NEXT: ret ptr @foo8.default -// //. // CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+d,+f,+i,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" } // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+i" } diff --git a/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp b/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp index d53e5c0520e6..13a0226ce541 100644 --- a/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp +++ b/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp @@ -16,9 +16,8 @@ __attribute__((target_clones("default", "arch=+zvkt"))) int foo6(void) { return __attribute__((target_clones("default", "arch=+zbb", "arch=+zba", "arch=+zbb,+zba"))) int foo7(void) { return 2; } __attribute__((target_clones("default", "arch=+zbb;priority=2", "arch=+zba;priority=1", "arch=+zbb,+zba;priority=3"))) int foo8(void) { return 2; } __attribute__((target_clones("default", "arch=+zbb;priority=1", "priority=2;arch=+zba", "priority=3;arch=+zbb,+zba"))) int foo9(void) { return 2; } -__attribute__((target_clones("default", "arch=+zbb;priority=-1", "priority=-2;arch=+zba", "priority=3;arch=+zbb,+zba"))) int foo10(void) { return 2; } -int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() + foo8() + foo9() + foo10(); } +int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() + foo8() + foo9(); } //. 
// CHECK: @__riscv_feature_bits = external dso_local global { i32, [2 x i64] } @@ -31,7 +30,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() + // CHECK: @_Z4foo7v.ifunc = weak_odr alias i32 (), ptr @_Z4foo7v // CHECK: @_Z4foo8v.ifunc = weak_odr alias i32 (), ptr @_Z4foo8v // CHECK: @_Z4foo9v.ifunc = weak_odr alias i32 (), ptr @_Z4foo9v -// CHECK: @_Z5foo10v.ifunc = weak_odr alias i32 (), ptr @_Z5foo10v // CHECK: @_Z4foo1v = weak_odr ifunc i32 (), ptr @_Z4foo1v.resolver // CHECK: @_Z4foo2v = weak_odr ifunc i32 (), ptr @_Z4foo2v.resolver // CHECK: @_Z4foo3v = weak_odr ifunc i32 (), ptr @_Z4foo3v.resolver @@ -41,7 +39,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() + // CHECK: @_Z4foo7v = weak_odr ifunc i32 (), ptr @_Z4foo7v.resolver // CHECK: @_Z4foo8v = weak_odr ifunc i32 (), ptr @_Z4foo8v.resolver // CHECK: @_Z4foo9v = weak_odr ifunc i32 (), ptr @_Z4foo9v.resolver -// CHECK: @_Z5foo10v = weak_odr ifunc i32 (), ptr @_Z5foo10v.resolver //. // CHECK-LABEL: define dso_local noundef signext i32 @_Z4foo1v.default( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -346,57 +343,6 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() + // CHECK-NEXT: ret ptr @_Z4foo9v.default // // -// CHECK-LABEL: define dso_local noundef signext i32 @_Z5foo10v.default( -// CHECK-SAME: ) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK-LABEL: define dso_local noundef signext i32 @_Z5foo10v._zbb( -// CHECK-SAME: ) #[[ATTR1]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK-LABEL: define dso_local noundef signext i32 @_Z5foo10v._zba( -// CHECK-SAME: ) #[[ATTR5]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK-LABEL: define dso_local noundef signext i32 @_Z5foo10v._zba_zbb( -// CHECK-SAME: ) #[[ATTR6]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK-LABEL: define weak_odr ptr @_Z5foo10v.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_riscv_feature_bits(ptr null) -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 402653184 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 402653184 -// CHECK-NEXT: br i1 [[TMP2]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z5foo10v._zba_zbb -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], 268435456 -// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 268435456 -// CHECK-NEXT: br i1 [[TMP5]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @_Z5foo10v._zbb -// CHECK: resolver_else2: -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 134217728 -// CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 134217728 -// CHECK-NEXT: br i1 [[TMP8]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] -// CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @_Z5foo10v._zba -// CHECK: resolver_else4: -// CHECK-NEXT: ret ptr @_Z5foo10v.default -// -// // CHECK-LABEL: define dso_local noundef 
signext i32 @_Z3barv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: entry: @@ -417,9 +363,7 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() + // CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD11]], [[CALL12]] // CHECK-NEXT: [[CALL14:%.*]] = call noundef signext i32 @_Z4foo9v() // CHECK-NEXT: [[ADD15:%.*]] = add nsw i32 [[ADD13]], [[CALL14]] -// CHECK-NEXT: [[CALL16:%.*]] = call noundef signext i32 @_Z5foo10v() -// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[ADD15]], [[CALL16]] -// CHECK-NEXT: ret i32 [[ADD17]] +// CHECK-NEXT: ret i32 [[ADD15]] // //. // CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+i,+m,+zmmul" } diff --git a/clang/test/CodeGenCXX/attr-target-version-riscv.cpp b/clang/test/CodeGenCXX/attr-target-version-riscv.cpp index 9078f6541b3d..51fae0902ab7 100644 --- a/clang/test/CodeGenCXX/attr-target-version-riscv.cpp +++ b/clang/test/CodeGenCXX/attr-target-version-riscv.cpp @@ -32,11 +32,6 @@ __attribute__((target_version("arch=+zbb;priority=9"))) int foo7(void) { return __attribute__((target_version("arch=+zbb,+zba;priority=10"))) int foo7(void) { return 1; } __attribute__((target_version("default"))) int foo7(void) { return 1; } -__attribute__((target_version("priority=-1;arch=+zba"))) int foo8(void) { return 1; } -__attribute__((target_version("arch=+zbb;priority=-2"))) int foo8(void) { return 1; } -__attribute__((target_version("arch=+zbb,+zba;priority=3"))) int foo8(void) { return 1; } -__attribute__((target_version("default"))) int foo8(void) { return 1; } - int bar() { return foo1() + foo2() + foo3(); } //. // CHECK: @__riscv_feature_bits = external dso_local global { i32, [2 x i64] } @@ -47,7 +42,6 @@ int bar() { return foo1() + foo2() + foo3(); } // CHECK: @_Z4foo5v = weak_odr ifunc i32 (), ptr @_Z4foo5v.resolver // CHECK: @_Z4foo6v = weak_odr ifunc i32 (), ptr @_Z4foo6v.resolver // CHECK: @_Z4foo7v = weak_odr ifunc i32 (), ptr @_Z4foo7v.resolver -// CHECK: @_Z4foo8v = weak_odr ifunc i32 (), ptr @_Z4foo8v.resolver //. 
// CHECK-LABEL: define dso_local noundef signext i32 @_Z4foo1v._v( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -193,30 +187,6 @@ int bar() { return foo1() + foo2() + foo3(); } // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: define dso_local noundef signext i32 @_Z4foo8v._zba( -// CHECK-SAME: ) #[[ATTR4]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK-LABEL: define dso_local noundef signext i32 @_Z4foo8v._zbb( -// CHECK-SAME: ) #[[ATTR2]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK-LABEL: define dso_local noundef signext i32 @_Z4foo8v._zba_zbb( -// CHECK-SAME: ) #[[ATTR5]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK-LABEL: define dso_local noundef signext i32 @_Z4foo8v.default( -// CHECK-SAME: ) #[[ATTR1]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// // CHECK-LABEL: define dso_local noundef signext i32 @_Z3barv( // CHECK-SAME: ) #[[ATTR1]] { // CHECK-NEXT: entry: @@ -388,33 +358,6 @@ int bar() { return foo1() + foo2() + foo3(); } // CHECK: resolver_else4: // CHECK-NEXT: ret ptr @_Z4foo7v.default // -// -// CHECK-LABEL: define weak_odr ptr @_Z4foo8v.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_riscv_feature_bits(ptr null) -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 402653184 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 402653184 -// CHECK-NEXT: br i1 [[TMP2]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z4foo8v._zba_zbb -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], 134217728 -// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 134217728 -// CHECK-NEXT: br i1 [[TMP5]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @_Z4foo8v._zba -// CHECK: resolver_else2: -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 -// CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 268435456 -// CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 268435456 -// CHECK-NEXT: br i1 [[TMP8]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] -// CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @_Z4foo8v._zbb -// CHECK: resolver_else4: -// CHECK-NEXT: ret ptr @_Z4foo8v.default -// //. 
// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+d,+f,+i,+m,+v,+zicsr,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" }
// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+i,+m,+zmmul" }
diff --git a/clang/test/SemaCXX/attr-target-clones-riscv.cpp b/clang/test/SemaCXX/attr-target-clones-riscv.cpp
index 4425dd2108a6..102bb4b9b3d2 100644
--- a/clang/test/SemaCXX/attr-target-clones-riscv.cpp
+++ b/clang/test/SemaCXX/attr-target-clones-riscv.cpp
@@ -33,6 +33,9 @@ void __attribute__((target_clones("default;priority=2", "arch=+c"))) UnsupportDe
 // expected-warning@+1 {{unsupported 'priority=2;default' in the 'target_clones' attribute string; 'target_clones' attribute ignored}}
 void __attribute__((target_clones("priority=2;default", "arch=+c"))) UnsupportDefaultPriority2() {}
 
+// expected-warning@+1 {{unsupported 'arch=+c;priority=-1' in the 'target_clones' attribute string; 'target_clones' attribute ignored}}
+void __attribute__((target_clones("default", "arch=+c;priority=-1"))) UnsupportNegativePriority() {}
+
 // expected-warning@+1 {{unsupported 'arch=+c,zbb' in the 'target_clones' attribute string; 'target_clones' attribute ignored}}
 void __attribute__((target_clones("default", "arch=+c,zbb"))) WithoutAddSign() {}
diff --git a/clang/test/SemaCXX/attr-target-version-riscv.cpp b/clang/test/SemaCXX/attr-target-version-riscv.cpp
index 785a3c6abafe..f7e6811533ac 100644
--- a/clang/test/SemaCXX/attr-target-version-riscv.cpp
+++ b/clang/test/SemaCXX/attr-target-version-riscv.cpp
@@ -111,3 +111,9 @@ __attribute__((target_version("default"))) int invalidVerson4(void) { return 2;
 __attribute__((target_version("priority=1"))) int prioriyWithoutArch(void) { return 2; }
 // expected-error@+1 {{redefinition of 'prioriyWithoutArch'}}
 __attribute__((target_version("default"))) int prioriyWithoutArch(void) { return 2; }
+
+// expected-warning@+2 {{unsupported '-1' in the 'target_version' attribute string; 'target_version' attribute ignored}}
+// expected-note@+1 {{previous definition is here}}
+__attribute__((target_version("arch=+c;priority=-1"))) int UnsupportNegativePriority(void) { return 2; }
+// expected-error@+1 {{redefinition of 'UnsupportNegativePriority'}}
+__attribute__((target_version("default"))) int UnsupportNegativePriority(void) { return 2; }
-- 
GitLab


From 6360652e9f5b5975d71c619abd981f102eeccec8 Mon Sep 17 00:00:00 2001
From: Akshat Oke
Date: Mon, 21 Oct 2024 13:44:09 +0530
Subject: [PATCH 210/511] Reland [AMDGPU] Serialize WWM_REG vreg flag
 (#110229) (#112492)

This is a reland, but not an exact copy: `VRegInfo.Flags` from the parser is
now an int8 instead of a vector, so we only need to copy the value over.
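For readers comparing this with the original #110229, here is a rough,
self-contained sketch of the parser-side difference. The vector-based variant
is an assumption reconstructed from the description above, not code quoted
from the original patch; only the direct copy mirrors what the loops in this
change do.

```cpp
#include <cstdint>
#include <vector>

// Assumed shape of the original attempt: flags parsed into a vector and
// folded together before being applied to the machine function info.
uint8_t foldParsedFlags(const std::vector<uint8_t> &Flags) {
  uint8_t Folded = 0;
  for (uint8_t F : Flags)
    Folded |= F; // accumulate every parsed flag
  return Folded;
}

// After the MIR parser change: Flags is already a single int8, so the
// value can be copied over as-is (it is what MFI->setFlag receives below).
uint8_t copyParsedFlags(uint8_t Flags) { return Flags; }
```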
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 +++++++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 10 ++++++++++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 8 ++++++++ .../MIR/AMDGPU/machine-function-info-no-ir.mir | 15 +++++++++++++++ 4 files changed, 40 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e4cc522194f2..4d94faf5facf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1718,6 +1718,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->reserveWWMRegister(ParsedReg); } + for (const auto &[_, Info] : PFS.VRegInfosNamed) { + MFI->setFlag(Info->VReg, Info->Flags); + } + for (const auto &[_, Info] : PFS.VRegInfos) { + MFI->setFlag(Info->VReg, Info->Flags); + } + auto parseAndCheckArgument = [&](const std::optional &A, const TargetRegisterClass &RC, ArgDescriptor &Arg, unsigned UserSGPRs, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 8de16974a3e7..1929df12124f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3860,3 +3860,13 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, return getHWRegIndex(Reg) + 1; return 0; } + +SmallVector +SIRegisterInfo::getVRegFlagsOfReg(Register Reg, + const MachineFunction &MF) const { + SmallVector RegFlags; + const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) + RegFlags.push_back("WWM_REG"); + return RegFlags; +} diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index e12a41371c7f..8e481e3ac230 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -462,6 +462,14 @@ public: // Does not go inside function calls. unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC) const; + + std::optional getVRegFlagValue(StringRef Name) const override { + return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG + : std::optional{}; + } + + SmallVector + getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override; }; namespace AMDGPU { diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index ebbb89b7816c..51795a4fea51 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -578,3 +578,18 @@ body: | SI_RETURN ... +--- +name: vregs +# FULL: registers: +# FULL-NEXT: - { id: 0, class: vgpr_32, preferred-register: '$vgpr1', flags: [ WWM_REG ] } +# FULL-NEXT: - { id: 1, class: sgpr_64, preferred-register: '$sgpr0_sgpr1', flags: [ ] } +# FULL-NEXT: - { id: 2, class: sgpr_64, preferred-register: '', flags: [ ] } +registers: + - { id: 0, class: vgpr_32, preferred-register: $vgpr1, flags: [ WWM_REG ]} + - { id: 1, class: sgpr_64, preferred-register: $sgpr0_sgpr1 } + - { id: 2, class: sgpr_64, flags: [ ] } +body: | + bb.0: + %2:sgpr_64 = COPY %1 + %1:sgpr_64 = COPY %0 +... 
-- GitLab From 911a6f2fcc719c46b5b392823473ba0bb5b1f4e1 Mon Sep 17 00:00:00 2001 From: Liu An Date: Mon, 21 Oct 2024 17:04:55 +0800 Subject: [PATCH 211/511] [lldb][LoongArch64] Add support for LoongArch64 in elf-core for lldb (#112296) When using the lldb command 'target create --core' on the LoongArch64 architecture, this part of the code is required. --- .../RegisterContextPOSIX_loongarch64.cpp | 2 +- .../Plugins/Process/elf-core/CMakeLists.txt | 1 + .../RegisterContextPOSIXCore_loongarch64.cpp | 84 +++++++++++++ .../RegisterContextPOSIXCore_loongarch64.h | 58 +++++++++ .../Process/elf-core/ThreadElfCore.cpp | 6 + .../postmortem/elf-core/TestLinuxCore.py | 112 ++++++++++++++++++ .../elf-core/linux-loongarch64.core | Bin 0 -> 36044 bytes .../postmortem/elf-core/linux-loongarch64.out | Bin 0 -> 2944 bytes 8 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_loongarch64.cpp create mode 100644 lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_loongarch64.h create mode 100644 lldb/test/API/functionalities/postmortem/elf-core/linux-loongarch64.core create mode 100755 lldb/test/API/functionalities/postmortem/elf-core/linux-loongarch64.out diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.cpp index a48a58f28f7a..49f371fb949b 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.cpp @@ -58,7 +58,7 @@ RegisterContextPOSIX_loongarch64::GetRegisterInfoAtIndex(size_t reg) { } size_t RegisterContextPOSIX_loongarch64::GetRegisterSetCount() { - return m_register_info_up->GetRegisterCount(); + return m_register_info_up->GetRegisterSetCount(); } const lldb_private::RegisterSet * diff --git a/lldb/source/Plugins/Process/elf-core/CMakeLists.txt b/lldb/source/Plugins/Process/elf-core/CMakeLists.txt index 72925c835b5c..7473fa8d41cc 100644 --- a/lldb/source/Plugins/Process/elf-core/CMakeLists.txt +++ b/lldb/source/Plugins/Process/elf-core/CMakeLists.txt @@ -10,6 +10,7 @@ add_lldb_library(lldbPluginProcessElfCore PLUGIN RegisterContextPOSIXCore_s390x.cpp RegisterContextPOSIXCore_x86_64.cpp RegisterContextPOSIXCore_riscv64.cpp + RegisterContextPOSIXCore_loongarch64.cpp RegisterUtilities.cpp LINK_LIBS diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_loongarch64.cpp b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_loongarch64.cpp new file mode 100644 index 000000000000..f0500948a6ab --- /dev/null +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_loongarch64.cpp @@ -0,0 +1,84 @@ +//===-- RegisterContextPOSIXCore_loongarch64.cpp --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RegisterContextPOSIXCore_loongarch64.h" + +#include "lldb/Utility/DataBufferHeap.h" + +using namespace lldb_private; + +std::unique_ptr +RegisterContextCorePOSIX_loongarch64::Create(Thread &thread, + const ArchSpec &arch, + const DataExtractor &gpregset, + llvm::ArrayRef notes) { + return std::unique_ptr( + new RegisterContextCorePOSIX_loongarch64( + thread, + std::make_unique(arch, Flags()), + gpregset, notes)); +} + +RegisterContextCorePOSIX_loongarch64::RegisterContextCorePOSIX_loongarch64( + Thread &thread, + std::unique_ptr register_info, + const DataExtractor &gpregset, llvm::ArrayRef notes) + : RegisterContextPOSIX_loongarch64(thread, std::move(register_info)) { + + m_gpr.SetData(std::make_shared(gpregset.GetDataStart(), + gpregset.GetByteSize())); + m_gpr.SetByteOrder(gpregset.GetByteOrder()); + + ArchSpec arch = m_register_info_up->GetTargetArchitecture(); + DataExtractor fpregset = getRegset(notes, arch.GetTriple(), FPR_Desc); + m_fpr.SetData(std::make_shared(fpregset.GetDataStart(), + fpregset.GetByteSize())); + m_fpr.SetByteOrder(fpregset.GetByteOrder()); +} + +RegisterContextCorePOSIX_loongarch64::~RegisterContextCorePOSIX_loongarch64() = + default; + +bool RegisterContextCorePOSIX_loongarch64::ReadGPR() { return true; } + +bool RegisterContextCorePOSIX_loongarch64::ReadFPR() { return true; } + +bool RegisterContextCorePOSIX_loongarch64::WriteGPR() { + assert(false && "Writing registers is not allowed for core dumps"); + return false; +} + +bool RegisterContextCorePOSIX_loongarch64::WriteFPR() { + assert(false && "Writing registers is not allowed for core dumps"); + return false; +} + +bool RegisterContextCorePOSIX_loongarch64::ReadRegister( + const RegisterInfo *reg_info, RegisterValue &value) { + const uint8_t *src = nullptr; + lldb::offset_t offset = reg_info->byte_offset; + + if (IsGPR(reg_info->kinds[lldb::eRegisterKindLLDB])) { + src = m_gpr.GetDataStart(); + } else if (IsFPR(reg_info->kinds[lldb::eRegisterKindLLDB])) { + src = m_fpr.GetDataStart(); + offset -= GetGPRSize(); + } else { + return false; + } + + Status error; + value.SetFromMemoryData(*reg_info, src + offset, reg_info->byte_size, + lldb::eByteOrderLittle, error); + return error.Success(); +} + +bool RegisterContextCorePOSIX_loongarch64::WriteRegister( + const RegisterInfo *reg_info, const RegisterValue &value) { + return false; +} diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_loongarch64.h b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_loongarch64.h new file mode 100644 index 000000000000..7bb53bd64203 --- /dev/null +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_loongarch64.h @@ -0,0 +1,58 @@ +//===-- RegisterContextPOSIXCore_loongarch64.h -------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_PROCESS_ELF_CORE_REGISTERCONTEXTPOSIXCORE_LOONGARCH64_H +#define LLDB_SOURCE_PLUGINS_PROCESS_ELF_CORE_REGISTERCONTEXTPOSIXCORE_LOONGARCH64_H + +#include "Plugins/Process/Utility/RegisterContextPOSIX_loongarch64.h" +#include "Plugins/Process/Utility/RegisterInfoPOSIX_loongarch64.h" + +#include "Plugins/Process/elf-core/RegisterUtilities.h" +#include "lldb/Target/Thread.h" +#include "lldb/Utility/DataExtractor.h" +#include "lldb/Utility/RegisterValue.h" + +#include + +class RegisterContextCorePOSIX_loongarch64 + : public RegisterContextPOSIX_loongarch64 { +public: + static std::unique_ptr + Create(lldb_private::Thread &thread, const lldb_private::ArchSpec &arch, + const lldb_private::DataExtractor &gpregset, + llvm::ArrayRef notes); + + ~RegisterContextCorePOSIX_loongarch64() override; + + bool ReadRegister(const lldb_private::RegisterInfo *reg_info, + lldb_private::RegisterValue &value) override; + + bool WriteRegister(const lldb_private::RegisterInfo *reg_info, + const lldb_private::RegisterValue &value) override; + +protected: + RegisterContextCorePOSIX_loongarch64( + lldb_private::Thread &thread, + std::unique_ptr register_info, + const lldb_private::DataExtractor &gpregset, + llvm::ArrayRef notes); + + bool ReadGPR() override; + + bool ReadFPR() override; + + bool WriteGPR() override; + + bool WriteFPR() override; + +private: + lldb_private::DataExtractor m_gpr; + lldb_private::DataExtractor m_fpr; +}; + +#endif // LLDB_SOURCE_PLUGINS_PROCESS_ELF_CORE_REGISTERCONTEXTPOSIXCORE_LOONGARCH64_H diff --git a/lldb/source/Plugins/Process/elf-core/ThreadElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ThreadElfCore.cpp index 52b96052bdbe..f2838087298e 100644 --- a/lldb/source/Plugins/Process/elf-core/ThreadElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ThreadElfCore.cpp @@ -33,6 +33,7 @@ #include "RegisterContextLinuxCore_x86_64.h" #include "RegisterContextPOSIXCore_arm.h" #include "RegisterContextPOSIXCore_arm64.h" +#include "RegisterContextPOSIXCore_loongarch64.h" #include "RegisterContextPOSIXCore_mips64.h" #include "RegisterContextPOSIXCore_powerpc.h" #include "RegisterContextPOSIXCore_ppc64le.h" @@ -171,6 +172,7 @@ ThreadElfCore::CreateRegisterContextForFrame(StackFrame *frame) { if (!reg_interface && arch.GetMachine() != llvm::Triple::aarch64 && arch.GetMachine() != llvm::Triple::arm && + arch.GetMachine() != llvm::Triple::loongarch64 && arch.GetMachine() != llvm::Triple::riscv64) { LLDB_LOGF(log, "elf-core::%s:: Architecture(%d) or OS(%d) not supported", __FUNCTION__, arch.GetMachine(), arch.GetTriple().getOS()); @@ -187,6 +189,10 @@ ThreadElfCore::CreateRegisterContextForFrame(StackFrame *frame) { *this, std::make_unique(arch), m_gpregset_data, m_notes); break; + case llvm::Triple::loongarch64: + m_thread_reg_ctx_sp = RegisterContextCorePOSIX_loongarch64::Create( + *this, arch, m_gpregset_data, m_notes); + break; case llvm::Triple::riscv64: m_thread_reg_ctx_sp = RegisterContextCorePOSIX_riscv64::Create( *this, arch, m_gpregset_data, m_notes); diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py index 7e8531c88bf3..376d6492d83b 100644 --- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py +++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py @@ -23,6 +23,7 @@ class 
LinuxCoreTestCase(TestBase): _ppc64le_pid = 28147 _riscv64_gpr_fpr_pid = 1089 _riscv64_gpr_only_pid = 97 + _loongarch64_pid = 456735 _aarch64_regions = 4 _i386_regions = 4 @@ -30,6 +31,7 @@ class LinuxCoreTestCase(TestBase): _s390x_regions = 2 _ppc64le_regions = 2 _riscv64_regions = 4 + _loongarch64_regions = 4 @skipIfLLVMTargetMissing("AArch64") def test_aarch64(self): @@ -82,6 +84,16 @@ class LinuxCoreTestCase(TestBase): "a.out", ) + @skipIfLLVMTargetMissing("LoongArch") + def test_loongarch64(self): + """Test that lldb can read the process information from an loongarch64 linux core file.""" + self.do_test( + "linux-loongarch64", + self._loongarch64_pid, + self._loongarch64_regions, + "a.out", + ) + @skipIfLLVMTargetMissing("X86") def test_same_pid_running(self): """Test that we read the information from the core correctly even if we have a running @@ -833,6 +845,106 @@ class LinuxCoreTestCase(TestBase): substrs=["registers were unavailable"], ) + @skipIfLLVMTargetMissing("LoongArch") + def test_loongarch64_regs(self): + # check registers using 64 bit LoongArch core file containing GP and FP registers + target = self.dbg.CreateTarget(None) + self.assertTrue(target, VALID_TARGET) + process = target.LoadCore("linux-loongarch64.core") + + values = {} + values["r0"] = "0x0000000000000000" + values["r1"] = "0x000000012000016c" + values["r2"] = "0x0000000000000000" + values["r3"] = "0x00007ffffb8249e0" + values["r4"] = "0x0000000000000000" + values["r5"] = "0x000000012000010c" + values["r6"] = "0x0000000000000000" + values["r7"] = "0x0000000000000000" + values["r8"] = "0x0000000000000000" + values["r9"] = "0x0000000000000000" + values["r10"] = "0x0000000000000000" + values["r11"] = "0x00000000000000dd" + values["r12"] = "0x0000000000000000" + values["r13"] = "0x000000000000002f" + values["r14"] = "0x0000000000000000" + values["r15"] = "0x0000000000000000" + values["r16"] = "0x0000000000000000" + values["r17"] = "0x0000000000000000" + values["r18"] = "0x0000000000000000" + values["r19"] = "0x0000000000000000" + values["r20"] = "0x0000000000000000" + values["r21"] = "0x0000000000000000" + values["r22"] = "0x00007ffffb824a10" + values["r23"] = "0x0000000000000000" + values["r24"] = "0x0000000000000000" + values["r25"] = "0x0000000000000000" + values["r26"] = "0x0000000000000000" + values["r27"] = "0x0000000000000000" + values["r28"] = "0x0000000000000000" + values["r29"] = "0x0000000000000000" + values["r30"] = "0x0000000000000000" + values["r31"] = "0x0000000000000000" + values["orig_a0"] = "0x0000555556b62d50" + values["pc"] = "0x000000012000012c" + + fpr_values = {} + fpr_values["f0"] = "0x00000000ffffff05" + fpr_values["f1"] = "0x2525252525252525" + fpr_values["f2"] = "0x2525252525560005" + fpr_values["f3"] = "0x000000000000ffff" + fpr_values["f4"] = "0x0000000000000000" + fpr_values["f5"] = "0x0000000000000008" + fpr_values["f6"] = "0x0f0e0d0c0b0a0908" + fpr_values["f7"] = "0xffffffffffffffff" + fpr_values["f8"] = "0x6261747563657845" + fpr_values["f9"] = "0x766173206562206c" + fpr_values["f10"] = "0xffffffffffffffff" + fpr_values["f11"] = "0xffffffffffffffff" + fpr_values["f12"] = "0xffffffffffffffff" + fpr_values["f13"] = "0xffffffffffffffff" + fpr_values["f14"] = "0xffffffffffffffff" + fpr_values["f15"] = "0xffffffffffffffff" + fpr_values["f16"] = "0xffffffffffffffff" + fpr_values["f17"] = "0xffffffffffffffff" + fpr_values["f18"] = "0xffffffffffffffff" + fpr_values["f19"] = "0xffffffffffffffff" + fpr_values["f20"] = "0xffffffffffffffff" + fpr_values["f21"] = "0xffffffffffffffff" + 
fpr_values["f22"] = "0xffffffffffffffff" + fpr_values["f23"] = "0xffffffffffffffff" + fpr_values["f24"] = "0xffffffffffffffff" + fpr_values["f25"] = "0xffffffffffffffff" + fpr_values["f26"] = "0xffffffffffffffff" + fpr_values["f27"] = "0xffffffffffffffff" + fpr_values["f28"] = "0xffffffffffffffff" + fpr_values["f29"] = "0xffffffffffffffff" + fpr_values["f30"] = "0xffffffffffffffff" + fpr_values["f31"] = "0xffffffffffffffff" + fpr_values["fcc0"] = "0x01" + fpr_values["fcc1"] = "0x00" + fpr_values["fcc2"] = "0x01" + fpr_values["fcc3"] = "0x01" + fpr_values["fcc4"] = "0x01" + fpr_values["fcc5"] = "0x01" + fpr_values["fcc6"] = "0x00" + fpr_values["fcc7"] = "0x01" + fpr_values["fcsr"] = "0x00000000" + + for regname, value in values.items(): + self.expect( + "register read {}".format(regname), + substrs=["{} = {}".format(regname, value)], + ) + + for regname, value in fpr_values.items(): + self.expect( + "register read {}".format(regname), + substrs=["{} = {}".format(regname, value)], + ) + + self.expect("register read --all") + def test_get_core_file_api(self): """ Test SBProcess::GetCoreFile() API can successfully get the core file. diff --git a/lldb/test/API/functionalities/postmortem/elf-core/linux-loongarch64.core b/lldb/test/API/functionalities/postmortem/elf-core/linux-loongarch64.core new file mode 100644 index 0000000000000000000000000000000000000000..8738950b8e08dcc474939d405c459cce986d66c0 GIT binary patch literal 36044 zcmeI4TWnlM8GvW@c-L#Muer2#Lh)K8s*UR11gww-xjn6&1XosT0tpeSBs*LFEArCx-2SAhu)FMzR6-Y=9UH*UO|JS?6 zYXVLyCGd}Q&OiTc&YW-N%}C^9kxEVW zx2THaG2FD_HvI9HCfFX37!$#i8s4C|mpAHbwrX{%yR8!Q%(K_YuJ+w%$zwZ9*Tyya zS)<;q2l6Pc0g7~dx7Q&Bv!)+AbsU6>;f+djooyrkh^YB*M-)r8-%Or)s5N5au2wy# zRSVnKXIq)|1pKYpTeaT?Ov!-0Q+42x)_(1@9nPDd_}Q7$reyT8X%(@)kJp;xT6FC5qJwg`!9#jV0b>x}_@4#~NI-Cucv!ed9UE{lBFL z>>wHYG!GBQ@0t5F%I(m1HbDAX$X3j^k5$*|gI1^vNo0@3Rd&vsZyxlcXoP~##KLi% zR&Cs)jhd}=kzS)Y#LRFEvGiDB#uTf)0lhDv-%UD;VXC|`XXpM1>2I%DotMw+pHKR^#ffyW9hMoTQ+%9?iZjt}Nce`B}OFx8w5D)@FKnMr{As_^VfDjM@LO=)z0U;m+gn$qb z0zyCt2mv7=1cZPP5CTF#2nYcoAOwVf5D)@FKnMr{As_^VfDjM@LO=-IZUoi>`~O3< z|G!52U0v=lZJ?M29ryn`aPNcNuDC^!_ta8o+-zV}b}#89Zt#S{64Fe;eYtqOX;qXa zkL3Zfv&2{|pZr55WA=Pm^`>{;KaBZ%;KFUYCm%hn?s?{SLkr)1^Ox5is}KHa=<{#< z?bAPeG40kvtKV54UtM1hy_Bkt_dIzibnV3l$FHU8W4$lbLSNBp9Nyl6n6Wg(#9?bD z=6IM>+su3UVaWXk z;F<%!3ljaGxTK#BVMONVGaU0cxRxQt?3teFi#K8(w=pNDr;o+QX8csvE5r{^CO(ol zG%-0&H3geYl^t|DW%zV~AK??*ls|zjth5R4=mF&(48ncc?J~o8aOmDhjpnE6Bbu(B z#rp}>k1}+ns3JOo8|ph3i$$e7d)>f<)>og!K)6)r-KKiwYzSv^=YgN1##H>>nA_yE+G8|9Ci|9~_2+7&H;PcNop?(8Xa5 z(UAix7JDG(z)suFgofv|iHYvj_nP3Yu)-&FZwZ{uoWb_(a}IWv6m0M9VDk{AU~l`Rg(e5G5=(2o7` zgC0lmX$q|#2RFF~3eK|*^;dHqK(h;5hTo{h`|>x33kTYRpZQ*}?*e-*uwLN$IRHIX zuKnEs)f8Mq+u1*~jh*KME(N&KEG(qTWtuMwe#WbKYQZmL;6RV^6L~nB0A!_P@o zX-e={BbbllkD|OMHVYuoYYXp ztZ3Z1-)zA1XWHd2Q~uXEkYq4F;1*Q~2mv7=1cZPP5CTF#2nYcoAOwVf5D)@FKnMr{ zAs_^VfDjM@LO=)z0U;m+gn$qb0zyCt2mv7=1cZPP5CTF#2nYcoAOwVf5D)@FKnMr{ zAs_^VfDjM@LO=)z0U;m+{yz!e@d2i|k6hlUuWg>W!}fq&JV$ZgZ*#5)#4ZHl8)pmr zq-xt|{|ltQO7f2+|4DM!s*T@Aa-8Hbl1n5nko+3S*GOI``3A{%N$&chZTBF_G|3Aj zze(~rlD{DNn=jeCKa*5zR^LzZev(g-e2L_*N%DJ-a%Fwr4dgvQ`WhW#z~^8n*E;Zg z0W29S8fOi-%AQ-o=V2TrKcC>jXA;mc1mxX3|9}mAB1?_NGe(ZLr~y1T0SoKBR&D5b zrUDlB!}&xl1y2&f^2(g$HDXouHgV}E{W|&aJP|B-4ht5RaieW_%Y60aOtJf4F~Tx= literal 0 HcmV?d00001 diff --git a/lldb/test/API/functionalities/postmortem/elf-core/linux-loongarch64.out b/lldb/test/API/functionalities/postmortem/elf-core/linux-loongarch64.out new file mode 100755 index 0000000000000000000000000000000000000000..ea3c61ac5bae7cc26b3afa886831d723036e021a GIT binary patch literal 2944 
zcmbtWO>7%Q6n?WC$9DXaq-m%_C~-tk9Yod%vO+0xYZ}uKDS}e*Q=v-EuDyvDyt`q&
zj?yX;4i~r}kxN0Ssz9iR0tYTgJ)kN`aLa)c962E>aX?7idNA+J>^SMh0VGD6c{AVp
zzBfCwZ)W$M=a(-ijDai)h*6tCme7)P#6)L-W2ZqyPa0A{oQUKI%XQf0wx9@5S{bkp
zVXF6pn=I!`I~Cy45|_%$zxuU)(fv>DP{1PnajYuV#TQ?NlQ)0KR6e`+p3$RQY
zoiy!z-4Ewc`*EN>-^pRD+%|({(jqs9+SOXzOh?~L`qD0DvWyv}ti
zs4JnaE#{T$V%IJK+hlA!rNSe)whY$Oold8K7Ru?_@^w~DJ_IX2l~Zb#d{~#!UqVa%
z7q;1Tg&4K(%93*hU3KIT7dhE3=ZfGgE-akM=E|mF=}va8pgpC{&J?DD9z2O6@zO=g
zB!f))ilX%>zatg_#>st29GoOl6l*VK#6|~8^|ynl{22+lahci+Fi2L}aiO2}``C@H
zx-PJ2suW6ScH3KAG1acYdjmk;s6`;zPh9
z3A8b^3~dsas&bUP8AKz8yvGrh+K@Mn$Cxq0b#0)=^yoZi7v`Q05->Ys>xNZqtj`x}
z)c30OVnd%VY|Lr1I9^?E73T}sXRYa(H!R;WoB4IO>D#X7n|9u8tj<(j&*a1FmZux__3U`9Ee-M|q
z=|Pn4M#}%gMTOosuF6koFTObWVUG&&FvQ0y3>?BK&-&ocPS`<_@f+1$uRL>faS$Fp
z=A(!oL%bcjFX;XpqOUgwx3`!T6A#So8mB$5G7hfgp&?Pqe&D1y-7H=4t4l
cP%qa<}q4c!e5QBp58xEogt@f8
goW36C(vI#LNDXIcZ7Z}xQgER7d^$1@BSe5!$ALaFcIAk_*KO3nJD<#<%MpfV5
Date: Mon, 21 Oct 2024 09:05:32 +0000
Subject: [PATCH 212/511] [gn build] Port 911a6f2fcc71

---
 .../gn/secondary/lldb/source/Plugins/Process/elf-core/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Process/elf-core/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Process/elf-core/BUILD.gn
index 67a05d391973..cc68b0da4e81 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/Process/elf-core/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Process/elf-core/BUILD.gn
@@ -18,6 +18,7 @@ static_library("elf-core") {
     "RegisterContextLinuxCore_x86_64.cpp",
     "RegisterContextPOSIXCore_arm.cpp",
     "RegisterContextPOSIXCore_arm64.cpp",
+    "RegisterContextPOSIXCore_loongarch64.cpp",
     "RegisterContextPOSIXCore_mips64.cpp",
     "RegisterContextPOSIXCore_powerpc.cpp",
     "RegisterContextPOSIXCore_ppc64le.cpp",
-- 
GitLab


From 4f06f79c03f8392f63f4430fcfcaefa763cf5c93 Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Mon, 21 Oct 2024 09:10:35 +0000
Subject: [PATCH 213/511] [llvm][llvm-lit] Handle testsuite elapsed time being
 None

The time for all testsuites will always exist because lit measures it
itself. For a given testsuite, however, it can be None if, for example, the
suite is empty.
---
 llvm/utils/lit/lit/reports.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/lit/lit/reports.py b/llvm/utils/lit/lit/reports.py
index d2d719b076bc..8ec83d698ae8 100755
--- a/llvm/utils/lit/lit/reports.py
+++ b/llvm/utils/lit/lit/reports.py
@@ -114,7 +114,7 @@ class XunitReport(object):
                 skipped += 1
             if t.isFailure():
                 failures += 1
-            time += t.result.elapsed
+            time += t.result.elapsed or 0.0
 
         name = suite.config.name.replace(".", "-")
         file.write(
-- 
GitLab


From 8507dbaec3f644b8a0c6291f097800d82a4f4b16 Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Mon, 21 Oct 2024 10:43:45 +0100
Subject: [PATCH 214/511] [llvm][llvm-lit] Add option to create unique result
 file names if results already exist (#112729)

When running a build like:
```
ninja check-clang check-llvm
```
Prior to my changes you ended up with one results file, in this specific
case JUnit XML:
```
results.xml
```
This would only include the last set of tests lit ran, which were for llvm.

To get around this, many CI systems will run one check target, move the
file away, then run another, somehow propagating the return code as well.
```
retcode=0
for target in $targets; do
  ninja $target || retcode=$?
  mv results.xml results-${target}.xml
done
```
I want to use something like this Buildkite reporting plugin in CI, which
needs to have all the results available:
https://buildkite.com/docs/agent/v3/cli-annotate#using-annotations-to-report-test-results

Modifying CI's build scripts for Windows and Linux is a lot of work. So my
changes instead make lit detect an existing result file and modify the file
name to find a new file to write to. Now you will get:
```
results.xml
results.<something>.xml
```
This will work for all result file types since I'm doing it in the base
Report class. Now that you've got separate files, it's easy to collect them
with `<dir>/*.xml`.

Note that the `<something>` is not ordered.
---
 llvm/utils/lit/lit/cl_arguments.py         | 30 ++++++---
 llvm/utils/lit/lit/reports.py              | 75 +++++++++++++---------
 llvm/utils/lit/tests/unique-output-file.py | 22 +++++++
 3 files changed, 89 insertions(+), 38 deletions(-)
 create mode 100644 llvm/utils/lit/tests/unique-output-file.py

diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py
index 5ccae4be0967..c08c51b7b7a2 100644
--- a/llvm/utils/lit/lit/cl_arguments.py
+++ b/llvm/utils/lit/lit/cl_arguments.py
@@ -175,6 +175,15 @@ def parse_args():
         type=lit.reports.TimeTraceReport,
         help="Write Chrome tracing compatible JSON to the specified file",
     )
+    execution_group.add_argument(
+        "--use-unique-output-file-name",
+        help="When enabled, lit will not overwrite existing test report files. "
+        "Instead it will write to a new file named the same as the output file "
+        "name but with an extra part before the file extension. For example "
+        "if results.xml already exists, results.<something>.xml will be written "
+        "to. The <something> is not ordered in any way. [Default: Off]",
+        action="store_true",
+    )
     execution_group.add_argument(
         "--timeout",
         dest="maxIndividualTestTime",
@@ -332,16 +341,21 @@ def parse_args():
     else:
         opts.shard = None
 
-    opts.reports = filter(
-        None,
-        [
-            opts.output,
-            opts.xunit_xml_output,
-            opts.resultdb_output,
-            opts.time_trace_output,
-        ],
+    opts.reports = list(
+        filter(
+            None,
+            [
+                opts.output,
+                opts.xunit_xml_output,
+                opts.resultdb_output,
+                opts.time_trace_output,
+            ],
+        )
     )
 
+    for report in opts.reports:
+        report.use_unique_output_file_name = opts.use_unique_output_file_name
+
     return opts

diff --git a/llvm/utils/lit/lit/reports.py b/llvm/utils/lit/lit/reports.py
index 8ec83d698ae8..8312dcddc769 100755
--- a/llvm/utils/lit/lit/reports.py
+++ b/llvm/utils/lit/lit/reports.py
@@ -1,7 +1,10 @@
+import abc
 import base64
 import datetime
 import itertools
 import json
+import os
+import tempfile
 
 from xml.sax.saxutils import quoteattr as quo
 
@@ -14,11 +17,34 @@ def by_suite_and_test_path(test):
     return (test.suite.name, id(test.suite), test.path_in_suite)
 
 
-class JsonReport(object):
+class Report(object):
     def __init__(self, output_file):
         self.output_file = output_file
+        # Set by the option parser later.
+        self.use_unique_output_file_name = False
 
     def write_results(self, tests, elapsed):
+        if self.use_unique_output_file_name:
+            filename, ext = os.path.splitext(os.path.basename(self.output_file))
+            fd, _ = tempfile.mkstemp(
+                suffix=ext, prefix=f"{filename}.", dir=os.path.dirname(self.output_file)
+            )
+            report_file = os.fdopen(fd, "w")
+        else:
+            # Overwrite if the results already exist.
+ report_file = open(self.output_file, "w") + + with report_file: + self._write_results_to_file(tests, elapsed, report_file) + + @abc.abstractmethod + def _write_results_to_file(self, tests, elapsed, file): + """Write test results to the file object "file".""" + pass + + +class JsonReport(Report): + def _write_results_to_file(self, tests, elapsed, file): unexecuted_codes = {lit.Test.EXCLUDED, lit.Test.SKIPPED} tests = [t for t in tests if t.result.code not in unexecuted_codes] # Construct the data we will write. @@ -67,9 +93,8 @@ class JsonReport(object): tests_data.append(test_data) - with open(self.output_file, "w") as file: - json.dump(data, file, indent=2, sort_keys=True) - file.write("\n") + json.dump(data, file, indent=2, sort_keys=True) + file.write("\n") _invalid_xml_chars_dict = { @@ -88,21 +113,18 @@ def remove_invalid_xml_chars(s): return s.translate(_invalid_xml_chars_dict) -class XunitReport(object): - def __init__(self, output_file): - self.output_file = output_file - self.skipped_codes = {lit.Test.EXCLUDED, lit.Test.SKIPPED, lit.Test.UNSUPPORTED} +class XunitReport(Report): + skipped_codes = {lit.Test.EXCLUDED, lit.Test.SKIPPED, lit.Test.UNSUPPORTED} - def write_results(self, tests, elapsed): + def _write_results_to_file(self, tests, elapsed, file): tests.sort(key=by_suite_and_test_path) tests_by_suite = itertools.groupby(tests, lambda t: t.suite) - with open(self.output_file, "w") as file: - file.write('\n') - file.write('\n'.format(time=elapsed)) - for suite, test_iter in tests_by_suite: - self._write_testsuite(file, suite, list(test_iter)) - file.write("\n") + file.write('\n') + file.write('\n'.format(time=elapsed)) + for suite, test_iter in tests_by_suite: + self._write_testsuite(file, suite, list(test_iter)) + file.write("\n") def _write_testsuite(self, file, suite, tests): skipped = 0 @@ -206,11 +228,8 @@ def gen_resultdb_test_entry( return test_data -class ResultDBReport(object): - def __init__(self, output_file): - self.output_file = output_file - - def write_results(self, tests, elapsed): +class ResultDBReport(Report): + def _write_results_to_file(self, tests, elapsed, file): unexecuted_codes = {lit.Test.EXCLUDED, lit.Test.SKIPPED} tests = [t for t in tests if t.result.code not in unexecuted_codes] data = {} @@ -249,17 +268,14 @@ class ResultDBReport(object): ) ) - with open(self.output_file, "w") as file: - json.dump(data, file, indent=2, sort_keys=True) - file.write("\n") + json.dump(data, file, indent=2, sort_keys=True) + file.write("\n") -class TimeTraceReport(object): - def __init__(self, output_file): - self.output_file = output_file - self.skipped_codes = {lit.Test.EXCLUDED, lit.Test.SKIPPED, lit.Test.UNSUPPORTED} +class TimeTraceReport(Report): + skipped_codes = {lit.Test.EXCLUDED, lit.Test.SKIPPED, lit.Test.UNSUPPORTED} - def write_results(self, tests, elapsed): + def _write_results_to_file(self, tests, elapsed, file): # Find when first test started so we can make start times relative. 
first_start_time = min([t.result.start for t in tests]) events = [ @@ -270,8 +286,7 @@ class TimeTraceReport(object): json_data = {"traceEvents": events} - with open(self.output_file, "w") as time_trace_file: - json.dump(json_data, time_trace_file, indent=2, sort_keys=True) + json.dump(json_data, time_trace_file, indent=2, sort_keys=True) def _get_test_event(self, test, first_start_time): test_name = test.getFullName() diff --git a/llvm/utils/lit/tests/unique-output-file.py b/llvm/utils/lit/tests/unique-output-file.py new file mode 100644 index 000000000000..fea57682d9fd --- /dev/null +++ b/llvm/utils/lit/tests/unique-output-file.py @@ -0,0 +1,22 @@ +## Check that lit will not overwrite existing result files when given +## --use-unique-output-file-name. + +## Files are overwritten without the option. +# RUN: rm -f %t.xunit*.xml +# RUN: echo "test" > %t.xunit.xml +# RUN: not %{lit} --xunit-xml-output %t.xunit.xml %{inputs}/xunit-output +# RUN: FileCheck < %t.xunit.xml %s --check-prefix=NEW +# NEW: +# NEW-NEXT: +## (other tests will check the contents of the whole file) + +# RUN: rm -f %t.xunit*.xml +# RUN: echo "test" > %t.xunit.xml +## Files should not be overwritten with the option. +# RUN: not %{lit} --xunit-xml-output %t.xunit.xml --use-unique-output-file-name %{inputs}/xunit-output +# RUN: FileCheck < %t.xunit.xml %s --check-prefix=EXISTING +# EXISTING: test +## Results in a new file with some discriminator added. +# RUN: ls -l %t.xunit*.xml | wc -l | FileCheck %s --check-prefix=NUMFILES +# NUMFILES: 2 +# RUN: FileCheck < %t.xunit.*.xml %s --check-prefix=NEW -- GitLab From f1ba8943c88ba2b53aaad407933dbb4b48b029d3 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Mon, 21 Oct 2024 11:44:31 +0200 Subject: [PATCH 215/511] [LLD][COFF] Support anti-dependency symbols (#112542) Co-authored-by: Billy Laws Anti-dependency symbols are allowed to be duplicated, with the first definition taking precedence. If a regular weak alias is present, it is preferred over an anti-dependency definition. Chaining anti-dependencies is not allowed. --- lld/COFF/Driver.cpp | 4 +- lld/COFF/InputFiles.cpp | 39 +++++++++------ lld/COFF/SymbolTable.cpp | 2 +- lld/COFF/Symbols.cpp | 5 ++ lld/COFF/Symbols.h | 10 +++- lld/test/COFF/weak-antidep-chain.test | 68 +++++++++++++++++++++++++++ lld/test/COFF/weak-antidep.test | 54 +++++++++++++++++++++ 7 files changed, 163 insertions(+), 19 deletions(-) create mode 100644 lld/test/COFF/weak-antidep-chain.test create mode 100644 lld/test/COFF/weak-antidep.test diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 12e1ae628112..e7f768789271 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -737,7 +737,7 @@ StringRef LinkerDriver::mangleMaybe(Symbol *s) { // If we find a similar mangled symbol, make this an alias to it and return // its name. 
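A hedged, self-contained sketch of those three rules follows (stand-in types,
not the actual lld interfaces; the real logic is `checkAndSetWeakAlias` in the
diff below):

```cpp
struct Symbol {};
struct Undefined {
  Symbol *weakAlias = nullptr;
  bool isAntiDep = false;
};

// Duplicated anti-dependencies keep the first definition; a regular weak
// alias may replace an anti-dependency one; two regular weak aliases to
// different targets are a duplicate-symbol error.
bool setWeakAlias(Undefined &U, Symbol *Target, bool IsAntiDep) {
  if (U.weakAlias && U.weakAlias != Target) {
    if (IsAntiDep)
      return true;  // ignore the later anti-dependency definition
    if (!U.isAntiDep)
      return false; // duplicate regular weak alias: report an error
  }
  U.weakAlias = Target; // a regular alias overrides an anti-dependency
  U.isAntiDep = IsAntiDep;
  return true;
}
```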
log(unmangled->getName() + " aliased to " + mangled->getName()); - unmangled->weakAlias = ctx.symtab.addUndefined(mangled->getName()); + unmangled->setWeakAlias(ctx.symtab.addUndefined(mangled->getName())); return mangled->getName(); } @@ -2520,7 +2520,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { continue; if (auto *u = dyn_cast(sym)) if (!u->weakAlias) - u->weakAlias = ctx.symtab.addUndefined(to); + u->setWeakAlias(ctx.symtab.addUndefined(to)); } // If any inputs are bitcode files, the LTO code generator may create diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index b19275abebc3..292c3bfc1eaa 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -74,19 +74,25 @@ std::string lld::toString(const coff::InputFile *file) { /// If Source is Undefined and has no weak alias set, makes it a weak /// alias to Target. static void checkAndSetWeakAlias(COFFLinkerContext &ctx, InputFile *f, - Symbol *source, Symbol *target) { + Symbol *source, Symbol *target, + bool isAntiDep) { if (auto *u = dyn_cast(source)) { if (u->weakAlias && u->weakAlias != target) { - // Weak aliases as produced by GCC are named in the form - // .weak.., where is the name - // of another symbol emitted near the weak symbol. - // Just use the definition from the first object file that defined - // this weak symbol. - if (ctx.config.allowDuplicateWeak) + // Ignore duplicated anti-dependency symbols. + if (isAntiDep) return; - ctx.symtab.reportDuplicate(source, f); + if (!u->isAntiDep) { + // Weak aliases as produced by GCC are named in the form + // .weak.., where is the name + // of another symbol emitted near the weak symbol. + // Just use the definition from the first object file that defined + // this weak symbol. + if (ctx.config.allowDuplicateWeak) + return; + ctx.symtab.reportDuplicate(source, f); + } } - u->weakAlias = target; + u->setWeakAlias(target, isAntiDep); } } @@ -436,7 +442,8 @@ void ObjFile::initializeSymbols() { uint32_t numSymbols = coffObj->getNumberOfSymbols(); symbols.resize(numSymbols); - SmallVector, 8> weakAliases; + SmallVector, 8> + weakAliases; std::vector pendingIndexes; pendingIndexes.reserve(numSymbols); @@ -451,8 +458,8 @@ void ObjFile::initializeSymbols() { symbols[i] = createUndefined(coffSym); } else if (coffSym.isWeakExternal()) { symbols[i] = createUndefined(coffSym); - uint32_t tagIndex = coffSym.getAux()->TagIndex; - weakAliases.emplace_back(symbols[i], tagIndex); + weakAliases.emplace_back(symbols[i], + coffSym.getAux()); } else if (std::optional optSym = createDefined(coffSym, comdatDefs, prevailingComdat)) { symbols[i] = *optSym; @@ -491,8 +498,10 @@ void ObjFile::initializeSymbols() { for (auto &kv : weakAliases) { Symbol *sym = kv.first; - uint32_t idx = kv.second; - checkAndSetWeakAlias(ctx, this, sym, symbols[idx]); + const coff_aux_weak_external *aux = kv.second; + checkAndSetWeakAlias(ctx, this, sym, symbols[aux->TagIndex], + aux->Characteristics == + IMAGE_WEAK_EXTERN_ANTI_DEPENDENCY); } // Free the memory used by sparseChunks now that symbol loading is finished. 
@@ -1202,7 +1211,7 @@ void BitcodeFile::parse() { sym = ctx.symtab.addUndefined(symName, this, true); std::string fallback = std::string(objSym.getCOFFWeakExternalFallback()); Symbol *alias = ctx.symtab.addUndefined(saver.save(fallback)); - checkAndSetWeakAlias(ctx, this, sym, alias); + checkAndSetWeakAlias(ctx, this, sym, alias, false); } else if (comdatIndex != -1) { if (symName == obj->getComdatTable()[comdatIndex].first) { sym = comdat[comdatIndex].first; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index fa09ea64babb..230ae74dfb21 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -315,7 +315,7 @@ void SymbolTable::loadMinGWSymbols() { warn("Resolving " + origName + " by linking to " + newName); else log("Resolving " + origName + " by linking to " + newName); - undef->weakAlias = l; + undef->setWeakAlias(l); continue; } } diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp index 89f2da02bdcf..f2fa2392ecbb 100644 --- a/lld/COFF/Symbols.cpp +++ b/lld/COFF/Symbols.cpp @@ -116,6 +116,9 @@ Symbol *Undefined::getWeakAlias() { // A weak alias may be a weak alias to another symbol, so check recursively. DenseSet weakChain; for (Symbol *a = weakAlias; a; a = cast(a)->weakAlias) { + // Anti-dependency symbols can't be chained. + if (a->isAntiDep) + break; if (!isa(a)) return a; if (!weakChain.insert(a).second) @@ -135,6 +138,7 @@ bool Undefined::resolveWeakAlias() { // Symbols. For that reason we need to check which type of symbol we // are dealing with and copy the correct number of bytes. StringRef name = getName(); + bool wasAntiDep = isAntiDep; if (isa(d)) memcpy(this, d, sizeof(DefinedRegular)); else if (isa(d)) @@ -144,6 +148,7 @@ bool Undefined::resolveWeakAlias() { nameData = name.data(); nameSize = name.size(); + isAntiDep = wasAntiDep; return true; } diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h index a898ebf05fd8..ff84ff8ad7b2 100644 --- a/lld/COFF/Symbols.h +++ b/lld/COFF/Symbols.h @@ -100,7 +100,7 @@ protected: : symbolKind(k), isExternal(true), isCOMDAT(false), writtenToSymtab(false), isUsedInRegularObj(false), pendingArchiveLoad(false), isGCRoot(false), isRuntimePseudoReloc(false), - deferUndefined(false), canInline(true), isWeak(false), + deferUndefined(false), canInline(true), isWeak(false), isAntiDep(false), nameSize(n.size()), nameData(n.empty() ? nullptr : n.data()) { assert((!n.empty() || k <= LastDefinedCOFFKind) && "If the name is empty, the Symbol must be a DefinedCOFF."); @@ -145,6 +145,9 @@ public: // managing weak symbol overrides. unsigned isWeak : 1; + // True if the symbol is an anti-dependency. + unsigned isAntiDep : 1; + protected: // Symbol name length. Assume symbol lengths fit in a 32-bit integer. uint32_t nameSize; @@ -345,6 +348,11 @@ public: return dyn_cast_or_null(getWeakAlias()); } + void setWeakAlias(Symbol *sym, bool antiDep = false) { + weakAlias = sym; + isAntiDep = antiDep; + } + // If this symbol is external weak, replace this object with aliased symbol. 
bool resolveWeakAlias(); }; diff --git a/lld/test/COFF/weak-antidep-chain.test b/lld/test/COFF/weak-antidep-chain.test new file mode 100644 index 000000000000..f6e32c5139a4 --- /dev/null +++ b/lld/test/COFF/weak-antidep-chain.test @@ -0,0 +1,68 @@ +REQUIRES: x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=x86_64-windows chain-bad.s -o chain-bad.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows chain-bad2.s -o chain-bad2.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows globals-bad.s -o globals-bad.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows chain-good.s -o chain-good.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows chain-good2.s -o chain-good2.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows globals-good.s -o globals-good.obj + +Temporary anti-dependency chains are allowed as long as they are broken by non-alias symbols. + +RUN: lld-link -machine:amd64 -dll -noentry -out:test.dll chain-good.obj globals-good.obj +RUN: lld-link -machine:amd64 -dll -noentry -out:test.dll chain-good2.obj globals-good.obj + +Chaining of anti-dependency symbols is not allowed. + +RUN: not lld-link -machine:amd64 -dll -noentry -out:test.dll chain-bad.obj globals-bad.obj 2>&1 \ +RUN: | FileCheck -check-prefix=ANTIDEP %s +RUN: not lld-link -machine:amd64 -dll -noentry -out:test.dll chain-bad2.obj globals-bad.obj 2>&1 \ +RUN: | FileCheck -check-prefix=ANTIDEP %s + +ANTIDEP: lld-link: error: undefined symbol: sym +ANTIDEP-NEXT: >>> referenced by chain-bad + +#--- chain-bad.s + .weak_anti_dep sym +.set sym,sym2 + .weak_anti_dep sym2 +.set sym2,sym3 + +#--- chain-bad2.s + .weak_anti_dep sym2 +.set sym2,sym3 + .weak sym +.set sym,sym2 + +#--- globals-bad.s + .section .test,"r" + .global sym3 +.set sym3,3 + +#--- chain-good.s + .weak_anti_dep sym +.set sym,sym2 + .weak_anti_dep sym2 +.set sym2,sym3 + .weak_anti_dep sym3 +.set sym3,sym4 + .weak_anti_dep sym4 + +#--- chain-good2.s + .weak_anti_dep sym +.set sym,sym2 + .weak_anti_dep sym2 +.set sym2,sym3 + .weak_anti_dep sym3 +.set sym3,weak_sym + .weak weak_sym +.set weak_sym,sym4 + .weak_anti_dep sym4 + +#--- globals-good.s + .section .test,"r" + .global sym2 +.set sym2,2 + .global sym4 +.set sym4,4 diff --git a/lld/test/COFF/weak-antidep.test b/lld/test/COFF/weak-antidep.test new file mode 100644 index 000000000000..691aa8f18433 --- /dev/null +++ b/lld/test/COFF/weak-antidep.test @@ -0,0 +1,54 @@ +REQUIRES: x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=x86_64-windows antidep.s -o antidep.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows antidep2.s -o antidep2.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows weak.s -o weak.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows test.s -o test.obj + +Check that a regular weak alias overrides an anti-dependency symbol. 
+ +RUN: lld-link -dll -noentry -out:out1.dll antidep.obj weak.obj test.obj +RUN: llvm-readobj --hex-dump=.test out1.dll | FileCheck --check-prefix=CHECK2 %s + +RUN: lld-link -dll -noentry -out:out2.dll weak.obj antidep.obj test.obj +RUN: llvm-readobj --hex-dump=.test out2.dll | FileCheck --check-prefix=CHECK2 %s + +RUN: lld-link -dll -noentry -out:out3.dll antidep.obj weak.obj test.obj -lld-allow-duplicate-weak +RUN: llvm-readobj --hex-dump=.test out3.dll | FileCheck --check-prefix=CHECK2 %s + +RUN: lld-link -dll -noentry -out:out4.dll weak.obj antidep.obj test.obj -lld-allow-duplicate-weak +RUN: llvm-readobj --hex-dump=.test out4.dll | FileCheck --check-prefix=CHECK2 %s + +When an anti-dependency symbol is duplicated, the first definition takes precedence over subsequent ones. + +RUN: lld-link -dll -noentry -out:out5.dll antidep.obj antidep2.obj test.obj +RUN: llvm-readobj --hex-dump=.test out5.dll | FileCheck --check-prefix=CHECK1 %s + +RUN: lld-link -dll -noentry -out:out6.dll antidep2.obj antidep.obj test.obj +RUN: llvm-readobj --hex-dump=.test out6.dll | FileCheck --check-prefix=CHECK2 %s + +CHECK1: 01000000 +CHECK2: 02000000 + +#--- antidep.s + .weak_anti_dep sym +.set sym,target1 + +#--- antidep2.s + .weak_anti_dep sym +.set sym,target2 + +#--- weak.s + .weak sym +.set sym,target2 + +#--- test.s + .section .target,"dr" + .globl target1 +.set target1,1 + .globl target2 +.set target2,2 + + .section .test,"dr" + .long sym -- GitLab From 159f25301763215ffc49c3c3aa6cb8095a990b41 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 21 Oct 2024 12:07:09 +0200 Subject: [PATCH 216/511] [clang][bytecode] Diagnose invalid declrefs differently if we've... (#113140) ... tried their initializer already. In that case, diagnose the non-const initializer instead of the reference to a non-constexpr variable later. This is used in a lot of openmp tests. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 5 +++-- clang/lib/AST/ByteCode/Interp.h | 13 +++++++++++-- clang/lib/AST/ByteCode/Opcodes.td | 2 +- clang/test/AST/ByteCode/openmp.cpp | 13 +++++++++++++ 4 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 clang/test/AST/ByteCode/openmp.cpp diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 3f068aa8c189..b960954d4754 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6097,7 +6097,8 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { if (VD->evaluateValue()) return revisit(VD); - return this->emitInvalidDeclRef(cast(E), E); + return this->emitInvalidDeclRef(cast(E), + /*InitializerFailed=*/true, E); } } } else { @@ -6123,7 +6124,7 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { } if (const auto *DRE = dyn_cast(E)) - return this->emitInvalidDeclRef(DRE, E); + return this->emitInvalidDeclRef(DRE, /*InitializerFailed=*/false, E); return false; } diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 1469fac5a177..c95b18ef72c9 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2818,9 +2818,18 @@ inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind, return false; } -inline bool InvalidDeclRef(InterpState &S, CodePtr OpPC, - const DeclRefExpr *DR) { +inline bool InvalidDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR, + bool InitializerFailed) { assert(DR); + + if (InitializerFailed) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + const auto *VD = cast(DR->getDecl()); + S.FFDiag(Loc, diag::note_constexpr_var_init_non_constant, 1) << VD; + S.Note(VD->getLocation(), diag::note_declared_at); + return false; + } + return CheckDeclRef(S, OpPC, DR); } diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index a1970f25ca97..9136e6b51660 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -769,7 +769,7 @@ def InvalidCast : Opcode { } def InvalidDeclRef : Opcode { - let Args = [ArgDeclRef]; + let Args = [ArgDeclRef, ArgBool]; } def SizelessVectorElementSize : Opcode; diff --git a/clang/test/AST/ByteCode/openmp.cpp b/clang/test/AST/ByteCode/openmp.cpp new file mode 100644 index 000000000000..e05fbe086625 --- /dev/null +++ b/clang/test/AST/ByteCode/openmp.cpp @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both -fopenmp %s +// RUN: %clang_cc1 -verify=ref,both -fopenmp %s + +int test1() { + int i; + int &j = i; // both-note {{declared here}} + float *f; + // both-note@+2 {{initializer of 'j' is not a constant expression}} + // both-error@+1 {{integral constant expression}} + #pragma omp for simd aligned(f:j) + for (int i = 0; i < 10; ++i); +} + -- GitLab From 95b4128c6a87e9b894aa75524e63be147cca790b Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Mon, 21 Oct 2024 11:27:34 +0100 Subject: [PATCH 217/511] [flang][debug] Don't generate debug for compiler-generated variables (#112423) Flang generates many globals to handle derived types. There was a check in the debug info pass that filtered these out based on their names starting with a period. This changed with PR#104859, which uses 'X' instead of '.'. This PR fixes the issue by also adding 'X' to that list. As user variables get lowercased by the NameUniquer, there is no risk that they will be filtered out. I added a test for that to be sure.
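To make the new filter concrete, here is a small sketch (the predicate body is taken verbatim from the patch; the example inputs are assumed shapes of deconstructed variable names matching the tests below):

```
// Body of the new fir::NameUniquer::isSpecialSymbol:
bool isSpecialSymbol(llvm::StringRef name) {
  return !name.empty() && (name[0] == '.' || name[0] == 'X');
}
// isSpecialSymbol("XnXxcx") -> true; a compiler-generated name where '.' was
//                              replaced by 'X' (PR#104859), so no debug info.
// isSpecialSymbol("xcx")    -> false; the user variable "XcX" is lowercased
//                              by the NameUniquer, so it keeps its debug info.
```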
--- .../flang/Optimizer/Support/InternalNames.h | 4 ++++ flang/lib/Optimizer/Support/InternalNames.cpp | 4 ++++ .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 5 +---- .../test/Integration/debug-extra-global-2.f90 | 8 ++++++++ flang/test/Integration/debug-extra-global.f90 | 14 ++++++++++++++ flang/test/Transforms/debug-extra-global.fir | 18 ++++++++++++++++++ 6 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 flang/test/Integration/debug-extra-global-2.f90 create mode 100644 flang/test/Integration/debug-extra-global.f90 create mode 100644 flang/test/Transforms/debug-extra-global.fir diff --git a/flang/include/flang/Optimizer/Support/InternalNames.h b/flang/include/flang/Optimizer/Support/InternalNames.h index 67ab36cf8da7..41f2cb9842dc 100644 --- a/flang/include/flang/Optimizer/Support/InternalNames.h +++ b/flang/include/flang/Optimizer/Support/InternalNames.h @@ -184,6 +184,10 @@ struct NameUniquer { static std::string replaceSpecialSymbols(const std::string &name); + /// Returns true if the passed name denotes a special symbol (e.g. global + /// symbol generated for derived type description). + static bool isSpecialSymbol(llvm::StringRef name); + private: static std::string intAsString(std::int64_t i); static std::string doKind(std::int64_t kind); diff --git a/flang/lib/Optimizer/Support/InternalNames.cpp b/flang/lib/Optimizer/Support/InternalNames.cpp index 58a5da5de797..011021c9f035 100644 --- a/flang/lib/Optimizer/Support/InternalNames.cpp +++ b/flang/lib/Optimizer/Support/InternalNames.cpp @@ -411,3 +411,7 @@ fir::NameUniquer::dropTypeConversionMarkers(llvm::StringRef mangledTypeName) { std::string fir::NameUniquer::replaceSpecialSymbols(const std::string &name) { return std::regex_replace(name, std::regex{"\\."}, "X"); } + +bool fir::NameUniquer::isSpecialSymbol(llvm::StringRef name) { + return !name.empty() && (name[0] == '.' || name[0] == 'X'); +} diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 400a8648dd7e..3a437c7a0f01 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -211,10 +211,7 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, if (result.first != fir::NameUniquer::NameKind::VARIABLE) return; - // Discard entries that describe a derived type. Usually start with '.c.', - // '.dt.' or '.n.'. It would be better if result of the deconstruct had a flag - // for such values so that we dont have to look at string values. + if (fir::NameUniquer::isSpecialSymbol(result.second.name)) return; unsigned line = getLineFromLoc(globalOp.getLoc()); diff --git a/flang/test/Integration/debug-extra-global-2.f90 b/flang/test/Integration/debug-extra-global-2.f90 new file mode 100644 index 000000000000..59cb4b66def5 --- /dev/null +++ b/flang/test/Integration/debug-extra-global-2.f90 @@ -0,0 +1,8 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +module m + integer XcX +end + +! Test that globals starting with 'X' don't get filtered. +! CHECK: !DIGlobalVariable(name: "xcx", linkageName: "_QMmExcx"{{.*}}) diff --git a/flang/test/Integration/debug-extra-global.f90 b/flang/test/Integration/debug-extra-global.f90 new file mode 100644 index 000000000000..c0ad2e306386 --- /dev/null +++ b/flang/test/Integration/debug-extra-global.f90 @@ -0,0 +1,14 @@ +! 
RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +program test + type t1 + integer :: XcX + integer :: xdtx + end type + type(t1) :: var + var%XcX = 2 + var%xdtx = 3 +end + +! Test that there is no debug info for compiler-generated globals. +! CHECK-NOT: DIGlobalVariable diff --git a/flang/test/Transforms/debug-extra-global.fir b/flang/test/Transforms/debug-extra-global.fir new file mode 100644 index 000000000000..d3bc22ad0c59 --- /dev/null +++ b/flang/test/Transforms/debug-extra-global.fir @@ -0,0 +1,18 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { + fir.global linkonce_odr @_QFEXnXxcx constant target : !fir.char<1,3> { + %0 = fir.string_lit "xcx"(3) : !fir.char<1,3> + fir.has_value %0 : !fir.char<1,3> + } loc(#loc1) + fir.global linkonce_odr @_QFEXnXxdtx constant target : !fir.char<1,4> { + %0 = fir.string_lit "xdtx"(4) : !fir.char<1,4> + fir.has_value %0 : !fir.char<1,4> + } loc(#loc1) +} +#loc1 = loc("derived.f90":24:1) + +// Test that no di_global_variable gets created for these compiler-generated +// globals. + +// CHECK-NOT: #di_global_variable -- GitLab From d906ac52ab8ee46090a6696f4ffb34c40ee6abb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Mon, 21 Oct 2024 12:30:03 +0200 Subject: [PATCH 218/511] [clang][AVR] Fix basic type size/alignment values to match avr-gcc. (#111290) Closes #102172 --- clang/include/clang/Basic/TargetInfo.h | 12 ++-- clang/lib/Basic/TargetInfo.cpp | 2 + clang/lib/Basic/Targets/AVR.h | 4 ++ clang/test/CodeGen/cx-complex-range.c | 80 ++++++++++++------------ clang/test/CodeGen/mdouble.c | 5 +- clang/test/Sema/avr-size-align.c | 49 +++++++++++++++ clang/test/Sema/unbounded-array-bounds.c | 14 ++--- 7 files changed, 109 insertions(+), 57 deletions(-) create mode 100644 clang/test/Sema/avr-size-align.c diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 577838506062..e7469e1e9891 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -87,6 +87,7 @@ enum class FloatModeKind { struct TransferrableTargetInfo { unsigned char PointerWidth, PointerAlign; unsigned char BoolWidth, BoolAlign; + unsigned char ShortWidth, ShortAlign; unsigned char IntWidth, IntAlign; unsigned char HalfWidth, HalfAlign; unsigned char BFloat16Width, BFloat16Align; @@ -497,13 +498,10 @@ public: unsigned getCharWidth() const { return 8; } // FIXME unsigned getCharAlign() const { return 8; } // FIXME - /// Return the size of 'signed short' and 'unsigned short' for this - /// target, in bits. - unsigned getShortWidth() const { return 16; } // FIXME - - /// Return the alignment of 'signed short' and 'unsigned short' for - /// this target. - unsigned getShortAlign() const { return 16; } // FIXME + /// getShortWidth/Align - Return the size of 'signed short' and + /// 'unsigned short' for this target, in bits. + unsigned getShortWidth() const { return ShortWidth; } + unsigned getShortAlign() const { return ShortAlign; } /// getIntWidth/Align - Return the size of 'signed int' and 'unsigned int' for /// this target, in bits. 
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 921953338210..145ca545854d 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -70,6 +70,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { HasStrictFP = false; PointerWidth = PointerAlign = 32; BoolWidth = BoolAlign = 8; + ShortWidth = ShortAlign = 16; IntWidth = IntAlign = 32; LongWidth = LongAlign = 32; LongLongWidth = LongLongAlign = 64; @@ -437,6 +438,7 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) { // what these normally are for the target. // We also define long long and long double here, although the // OpenCL standard only mentions these as "reserved". + ShortWidth = ShortAlign = 16; IntWidth = IntAlign = 32; LongWidth = LongAlign = 64; LongLongWidth = LongLongAlign = 128; diff --git a/clang/lib/Basic/Targets/AVR.h b/clang/lib/Basic/Targets/AVR.h index feeb04f37eeb..0a2f51747f8a 100644 --- a/clang/lib/Basic/Targets/AVR.h +++ b/clang/lib/Basic/Targets/AVR.h @@ -29,6 +29,8 @@ public: TLSSupported = false; PointerWidth = 16; PointerAlign = 8; + ShortWidth = 16; + ShortAlign = 8; IntWidth = 16; IntAlign = 8; LongWidth = 32; @@ -65,6 +67,8 @@ public: return std::nullopt; } + bool allowsLargerPreferedTypeAlignment() const override { return false; } + BuiltinVaListKind getBuiltinVaListKind() const override { return TargetInfo::VoidPtrBuiltinVaList; } diff --git a/clang/test/CodeGen/cx-complex-range.c b/clang/test/CodeGen/cx-complex-range.c index d83d4d02ac19..b2259031d756 100644 --- a/clang/test/CodeGen/cx-complex-range.c +++ b/clang/test/CodeGen/cx-complex-range.c @@ -1264,24 +1264,24 @@ _Complex float mulf(_Complex float a, _Complex float b) { // AVRFP32-SAME: float noundef [[A_COERCE0:%.*]], float noundef [[A_COERCE1:%.*]], float noundef [[B_COERCE0:%.*]], float noundef [[B_COERCE1:%.*]]) addrspace(1) #[[ATTR0]] { // AVRFP32-NEXT: entry: // AVRFP32-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 1 -// AVRFP32-NEXT: [[A:%.*]] = alloca { float, float }, align 4 -// AVRFP32-NEXT: [[B:%.*]] = alloca { float, float }, align 4 +// AVRFP32-NEXT: [[A:%.*]] = alloca { float, float }, align 1 +// AVRFP32-NEXT: [[B:%.*]] = alloca { float, float }, align 1 // AVRFP32-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i32 0, i32 0 -// AVRFP32-NEXT: store float [[A_COERCE0]], ptr [[TMP0]], align 4 +// AVRFP32-NEXT: store float [[A_COERCE0]], ptr [[TMP0]], align 1 // AVRFP32-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i32 0, i32 1 -// AVRFP32-NEXT: store float [[A_COERCE1]], ptr [[TMP1]], align 4 +// AVRFP32-NEXT: store float [[A_COERCE1]], ptr [[TMP1]], align 1 // AVRFP32-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i32 0, i32 0 -// AVRFP32-NEXT: store float [[B_COERCE0]], ptr [[TMP2]], align 4 +// AVRFP32-NEXT: store float [[B_COERCE0]], ptr [[TMP2]], align 1 // AVRFP32-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i32 0, i32 1 -// AVRFP32-NEXT: store float [[B_COERCE1]], ptr [[TMP3]], align 4 +// AVRFP32-NEXT: store float [[B_COERCE1]], ptr [[TMP3]], align 1 // AVRFP32-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i32 0, i32 0 -// AVRFP32-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// AVRFP32-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 1 // AVRFP32-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i32 0, i32 1 -// 
AVRFP32-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4 +// AVRFP32-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 1 // AVRFP32-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i32 0, i32 0 -// AVRFP32-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4 +// AVRFP32-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 1 // AVRFP32-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i32 0, i32 1 -// AVRFP32-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4 +// AVRFP32-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 1 // AVRFP32-NEXT: [[TMP4:%.*]] = call addrspace(1) float @llvm.fabs.f32(float [[B_REAL]]) // AVRFP32-NEXT: [[TMP5:%.*]] = call addrspace(1) float @llvm.fabs.f32(float [[B_IMAG]]) // AVRFP32-NEXT: [[ABS_CMP:%.*]] = fcmp ugt float [[TMP4]], [[TMP5]] @@ -1321,24 +1321,24 @@ _Complex float mulf(_Complex float a, _Complex float b) { // AVRFP64-LABEL: define dso_local void @divd( // AVRFP64-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 1 [[AGG_RESULT:%.*]], double noundef [[A_COERCE0:%.*]], double noundef [[A_COERCE1:%.*]], double noundef [[B_COERCE0:%.*]], double noundef [[B_COERCE1:%.*]]) addrspace(1) #[[ATTR0]] { // AVRFP64-NEXT: entry: -// AVRFP64-NEXT: [[A:%.*]] = alloca { double, double }, align 8 -// AVRFP64-NEXT: [[B:%.*]] = alloca { double, double }, align 8 +// AVRFP64-NEXT: [[A:%.*]] = alloca { double, double }, align 1 +// AVRFP64-NEXT: [[B:%.*]] = alloca { double, double }, align 1 // AVRFP64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 -// AVRFP64-NEXT: store double [[A_COERCE0]], ptr [[TMP0]], align 8 +// AVRFP64-NEXT: store double [[A_COERCE0]], ptr [[TMP0]], align 1 // AVRFP64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 1 -// AVRFP64-NEXT: store double [[A_COERCE1]], ptr [[TMP1]], align 8 +// AVRFP64-NEXT: store double [[A_COERCE1]], ptr [[TMP1]], align 1 // AVRFP64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 -// AVRFP64-NEXT: store double [[B_COERCE0]], ptr [[TMP2]], align 8 +// AVRFP64-NEXT: store double [[B_COERCE0]], ptr [[TMP2]], align 1 // AVRFP64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 1 -// AVRFP64-NEXT: store double [[B_COERCE1]], ptr [[TMP3]], align 8 +// AVRFP64-NEXT: store double [[B_COERCE1]], ptr [[TMP3]], align 1 // AVRFP64-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 -// AVRFP64-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 +// AVRFP64-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 1 // AVRFP64-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 1 -// AVRFP64-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8 +// AVRFP64-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 1 // AVRFP64-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 -// AVRFP64-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// AVRFP64-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 1 // AVRFP64-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 1 -// AVRFP64-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// AVRFP64-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 1 // AVRFP64-NEXT: 
[[TMP4:%.*]] = call addrspace(1) double @llvm.fabs.f64(double [[B_REAL]]) // AVRFP64-NEXT: [[TMP5:%.*]] = call addrspace(1) double @llvm.fabs.f64(double [[B_IMAG]]) // AVRFP64-NEXT: [[ABS_CMP:%.*]] = fcmp ugt double [[TMP4]], [[TMP5]] @@ -1862,24 +1862,24 @@ _Complex double divd(_Complex double a, _Complex double b) { // AVRFP32-SAME: float noundef [[A_COERCE0:%.*]], float noundef [[A_COERCE1:%.*]], float noundef [[B_COERCE0:%.*]], float noundef [[B_COERCE1:%.*]]) addrspace(1) #[[ATTR0]] { // AVRFP32-NEXT: entry: // AVRFP32-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 1 -// AVRFP32-NEXT: [[A:%.*]] = alloca { float, float }, align 4 -// AVRFP32-NEXT: [[B:%.*]] = alloca { float, float }, align 4 +// AVRFP32-NEXT: [[A:%.*]] = alloca { float, float }, align 1 +// AVRFP32-NEXT: [[B:%.*]] = alloca { float, float }, align 1 // AVRFP32-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i32 0, i32 0 -// AVRFP32-NEXT: store float [[A_COERCE0]], ptr [[TMP0]], align 4 +// AVRFP32-NEXT: store float [[A_COERCE0]], ptr [[TMP0]], align 1 // AVRFP32-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i32 0, i32 1 -// AVRFP32-NEXT: store float [[A_COERCE1]], ptr [[TMP1]], align 4 +// AVRFP32-NEXT: store float [[A_COERCE1]], ptr [[TMP1]], align 1 // AVRFP32-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i32 0, i32 0 -// AVRFP32-NEXT: store float [[B_COERCE0]], ptr [[TMP2]], align 4 +// AVRFP32-NEXT: store float [[B_COERCE0]], ptr [[TMP2]], align 1 // AVRFP32-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i32 0, i32 1 -// AVRFP32-NEXT: store float [[B_COERCE1]], ptr [[TMP3]], align 4 +// AVRFP32-NEXT: store float [[B_COERCE1]], ptr [[TMP3]], align 1 // AVRFP32-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i32 0, i32 0 -// AVRFP32-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// AVRFP32-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 1 // AVRFP32-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i32 0, i32 1 -// AVRFP32-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4 +// AVRFP32-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 1 // AVRFP32-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i32 0, i32 0 -// AVRFP32-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4 +// AVRFP32-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 1 // AVRFP32-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i32 0, i32 1 -// AVRFP32-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4 +// AVRFP32-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 1 // AVRFP32-NEXT: [[MUL_AC:%.*]] = fmul float [[A_REAL]], [[B_REAL]] // AVRFP32-NEXT: [[MUL_BD:%.*]] = fmul float [[A_IMAG]], [[B_IMAG]] // AVRFP32-NEXT: [[MUL_AD:%.*]] = fmul float [[A_REAL]], [[B_IMAG]] @@ -1896,24 +1896,24 @@ _Complex double divd(_Complex double a, _Complex double b) { // AVRFP64-LABEL: define dso_local void @muld( // AVRFP64-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 1 [[AGG_RESULT:%.*]], double noundef [[A_COERCE0:%.*]], double noundef [[A_COERCE1:%.*]], double noundef [[B_COERCE0:%.*]], double noundef [[B_COERCE1:%.*]]) addrspace(1) #[[ATTR0]] { // AVRFP64-NEXT: entry: -// AVRFP64-NEXT: [[A:%.*]] = alloca { double, double }, align 8 -// AVRFP64-NEXT: [[B:%.*]] = alloca { double, double }, align 8 +// AVRFP64-NEXT: 
[[A:%.*]] = alloca { double, double }, align 1 +// AVRFP64-NEXT: [[B:%.*]] = alloca { double, double }, align 1 // AVRFP64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 -// AVRFP64-NEXT: store double [[A_COERCE0]], ptr [[TMP0]], align 8 +// AVRFP64-NEXT: store double [[A_COERCE0]], ptr [[TMP0]], align 1 // AVRFP64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 1 -// AVRFP64-NEXT: store double [[A_COERCE1]], ptr [[TMP1]], align 8 +// AVRFP64-NEXT: store double [[A_COERCE1]], ptr [[TMP1]], align 1 // AVRFP64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 -// AVRFP64-NEXT: store double [[B_COERCE0]], ptr [[TMP2]], align 8 +// AVRFP64-NEXT: store double [[B_COERCE0]], ptr [[TMP2]], align 1 // AVRFP64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 1 -// AVRFP64-NEXT: store double [[B_COERCE1]], ptr [[TMP3]], align 8 +// AVRFP64-NEXT: store double [[B_COERCE1]], ptr [[TMP3]], align 1 // AVRFP64-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 -// AVRFP64-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 +// AVRFP64-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 1 // AVRFP64-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 1 -// AVRFP64-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8 +// AVRFP64-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 1 // AVRFP64-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 -// AVRFP64-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// AVRFP64-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 1 // AVRFP64-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 1 -// AVRFP64-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// AVRFP64-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 1 // AVRFP64-NEXT: [[MUL_AC:%.*]] = fmul double [[A_REAL]], [[B_REAL]] // AVRFP64-NEXT: [[MUL_BD:%.*]] = fmul double [[A_IMAG]], [[B_IMAG]] // AVRFP64-NEXT: [[MUL_AD:%.*]] = fmul double [[A_REAL]], [[B_IMAG]] diff --git a/clang/test/CodeGen/mdouble.c b/clang/test/CodeGen/mdouble.c index 6c73bd7b1590..dab0c5f834bc 100644 --- a/clang/test/CodeGen/mdouble.c +++ b/clang/test/CodeGen/mdouble.c @@ -6,8 +6,7 @@ double x = 0; int size = sizeof(x); -// FIXME: the double should have an alignment of 1 on AVR, not 4 or 8. 
-// AVR-FP64: @x ={{.*}} global double {{.*}}, align 8 +// AVR-FP64: @x ={{.*}} global double {{.*}}, align 1 // AVR-FP64: @size ={{.*}} global i16 8 -// AVR-FP32: @x ={{.*}} global float {{.*}}, align 4 +// AVR-FP32: @x ={{.*}} global float {{.*}}, align 1 // AVR-FP32: @size ={{.*}} global i16 4 diff --git a/clang/test/Sema/avr-size-align.c b/clang/test/Sema/avr-size-align.c new file mode 100644 index 000000000000..9fe94410c91c --- /dev/null +++ b/clang/test/Sema/avr-size-align.c @@ -0,0 +1,49 @@ +// RUN: %clang_cc1 %s -triple avr -fsyntax-only + +_Static_assert(sizeof(char) == 1, "sizeof(char) == 1"); +_Static_assert(_Alignof(char) == 1, "_Alignof(char) == 1"); +_Static_assert(__alignof(char) == 1, "__alignof(char) == 1"); + +_Static_assert(sizeof(short) == 2, "sizeof(short) == 2"); +_Static_assert(_Alignof(short) == 1, "_Alignof(short) == 1"); +_Static_assert(__alignof(short) == 1, "__alignof(short) == 1"); + +_Static_assert(sizeof(unsigned short) == 2, "sizeof(unsigned short) == 2"); +_Static_assert(_Alignof(unsigned short) == 1, "_Alignof(unsigned short) == 1"); +_Static_assert(__alignof(unsigned short) == 1, "__alignof(unsigned short) == 1"); + +_Static_assert(sizeof(int) == 2, "sizeof(int) == 2"); +_Static_assert(_Alignof(int) == 1, "_Alignof(int) == 1"); +_Static_assert(__alignof(int) == 1, "__alignof(int) == 1"); + +_Static_assert(sizeof(unsigned int) == 2, "sizeof(unsigned int) == 2"); +_Static_assert(_Alignof(unsigned int) == 1, "_Alignof(unsigned int) == 1"); +_Static_assert(__alignof(unsigned int) == 1, "__alignof(unsigned int) == 1"); + +_Static_assert(sizeof(long) == 4, "sizeof(long) == 4"); +_Static_assert(_Alignof(long) == 1, "_Alignof(long) == 1"); +_Static_assert(__alignof(long) == 1, "__alignof(long) == 1"); + +_Static_assert(sizeof(unsigned long) == 4, "sizeof(unsigned long) == 4"); +_Static_assert(_Alignof(unsigned long) == 1, "_Alignof(unsigned long) == 1"); +_Static_assert(__alignof(unsigned long) == 1, "__alignof(unsigned long) == 1"); + +_Static_assert(sizeof(long long) == 8, "sizeof(long long) == 8"); +_Static_assert(_Alignof(long long) == 1, "_Alignof(long long) == 1"); +_Static_assert(__alignof(long long) == 1, "__alignof(long long) == 1"); + +_Static_assert(sizeof(unsigned long long) == 8, "sizeof(unsigned long long) == 8"); +_Static_assert(_Alignof(unsigned long long) == 1, "_Alignof(unsigned long long) == 1"); +_Static_assert(__alignof(unsigned long long) == 1, "__alignof(unsigned long long) == 1"); + +_Static_assert(sizeof(float) == 4, "sizeof(float) == 4"); +_Static_assert(_Alignof(float) == 1, "_Alignof(float) == 1"); +_Static_assert(__alignof(float) == 1, "__alignof(float) == 1"); + +_Static_assert(sizeof(double) == 4, "sizeof(double) == 4"); +_Static_assert(_Alignof(double) == 1, "_Alignof(double) == 1"); +_Static_assert(__alignof(double) == 1, "__alignof(double) == 1"); + +_Static_assert(sizeof(long double) == 4, "sizeof(long double) == 4"); +_Static_assert(_Alignof(long double) == 1, "_Alignof(long double) == 1"); +_Static_assert(__alignof(long double) == 1, "__alignof(long double) == 1"); diff --git a/clang/test/Sema/unbounded-array-bounds.c b/clang/test/Sema/unbounded-array-bounds.c index 41d1972cf595..b22261a3eaeb 100644 --- a/clang/test/Sema/unbounded-array-bounds.c +++ b/clang/test/Sema/unbounded-array-bounds.c @@ -14,11 +14,11 @@ struct S s[]; // expected-warning {{tentative array definition}} expected-note { void f1(void) { ++s[3].a; ++s[7073650413200313099].b; - // addr16-warning@-1 {{array index 7073650413200313099 refers past the last 
possible element for an array in 16-bit address space containing 160-bit (20-byte) elements (max possible 3276 elements)}} + // addr16-warning@-1 {{array index 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} // addr32-warning@-2 {{array index 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178956970 elements)}} // addr64-warning@-3 {{array index 7073650413200313099 refers past the last possible element for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576460752303423488 elements)}} ++s[7073650].c; - // addr16-warning@-1 {{array index 7073650 refers past the last possible element for an array in 16-bit address space containing 160-bit (20-byte) elements (max possible 3276 elements)}} + // addr16-warning@-1 {{array index 7073650 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} } long long ll[]; // expected-warning {{tentative array definition}} expected-note {{declared here}} addr16-note {{declared here}} addr32-note {{declared here}} @@ -37,21 +37,21 @@ void f2(void) { void f3(struct S p[]) { // expected-note {{declared here}} addr16-note {{declared here}} ++p[3].a; ++p[7073650413200313099].b; - // addr16-warning@-1 {{array index 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 160-bit (20-byte) elements (max possible 3276 elements)}} + // addr16-warning@-1 {{array index 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} // addr32-warning@-2 {{array index 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178956970 elements)}} // addr64-warning@-3 {{array index 7073650413200313099 refers past the last possible element for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576460752303423488 elements)}} ++p[7073650].c; - // addr16-warning@-1 {{array index 7073650 refers past the last possible element for an array in 16-bit address space containing 160-bit (20-byte) elements (max possible 3276 elements)}} + // addr16-warning@-1 {{array index 7073650 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} } void f4(struct S *p) { // expected-note {{declared here}} addr16-note {{declared here}} p += 3; p += 7073650413200313099; - // addr16-warning@-1 {{the pointer incremented by 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 160-bit (20-byte) elements (max possible 3276 elements)}} + // addr16-warning@-1 {{the pointer incremented by 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} // addr32-warning@-2 {{the pointer incremented by 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178956970 elements)}} // addr64-warning@-3 {{the pointer incremented by 7073650413200313099 refers past the last possible element 
for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576460752303423488 elements)}} p += 7073650; - // addr16-warning@-1 {{the pointer incremented by 7073650 refers past the last possible element for an array in 16-bit address space containing 160-bit (20-byte) elements (max possible 3276 elements)}} + // addr16-warning@-1 {{the pointer incremented by 7073650 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} } struct BQ { @@ -63,7 +63,7 @@ struct BQ bq[]; // expected-warning {{tentative array definition}} addr16-note { void f5(void) { ++bq[0].bigblock[0].a; ++bq[1].bigblock[0].a; - // addr16-warning@-1 {{array index 1 refers past the last possible element for an array in 16-bit address space containing 524160-bit (65520-byte) elements (max possible 1 element)}} + // addr16-warning@-1 {{array index 1 refers past the last possible element for an array in 16-bit address space containing 497952-bit (62244-byte) elements (max possible 1 element)}} } void f6(void) { -- GitLab From a18dd29077c84fc076a4ed431d9e815a3d0b6f24 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 21 Oct 2024 12:42:47 +0200 Subject: [PATCH 219/511] [ConstantFolding] Set signed/implicitTrunc when handling GEP offsets GEP offsets have sext_or_trunc semantics. We were already doing this for the outer-most GEP, but not for the inner ones. I believe one of the sanitizer buildbot failures was due to this, but I did not manage to reproduce the issue or come up with a test case. Usually the problematic case will already be folded away due to index type canonicalization. --- llvm/lib/Analysis/ConstantFolding.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index c0104d2bc261..530e9287d9e1 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -924,7 +924,8 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, Ptr = cast(GEP->getOperand(0)); SrcElemTy = GEP->getSourceElementType(); Offset = Offset.sadd_ov( - APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps)), + APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps), + /*isSigned=*/true, /*implicitTrunc=*/true), Overflow); } -- GitLab From 25b58c877c851bed9c34362cd69bcd8d8bb65ac4 Mon Sep 17 00:00:00 2001 From: Krasimir Georgiev Date: Mon, 21 Oct 2024 13:08:49 +0200 Subject: [PATCH 220/511] bazelbuild: fix for commit 2ce10 (#113142) bazelbuild: fix for https://github.com/llvm/llvm-project/commit/2ce10f0491142863d3f21cd0adb312ab2cfed107. No functional changes intended. 
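(Returning to the ConstantFolding patch above, since its commit message notes that no reproducer was found: a minimal sketch of the APInt construction it relies on; the helper name and parameters here are made up for illustration.)

```
#include "llvm/ADT/APInt.h"
using namespace llvm;

// GEP offsets have sext_or_trunc semantics, so a 64-bit byte offset computed
// by DataLayout must be sign-extended or truncated to the index width; the
// isSigned/implicitTrunc flags request exactly that instead of tripping
// APInt's assertion on lossy values.
APInt makeGEPOffset(unsigned IndexBitWidth, int64_t ByteOffset) {
  return APInt(IndexBitWidth, ByteOffset, /*isSigned=*/true,
               /*implicitTrunc=*/true);
}
```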
--- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 1abc0ccda4c7..64859420b91b 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -119,6 +119,11 @@ libc_support_library( hdrs = ["include/llvm-libc-macros/linux/fcntl-macros.h"], ) +libc_support_library( + name = "llvm_libc_types_size_t", + hdrs = ["include/llvm-libc-types/size_t.h"], +) + ########################### Macro Proxy Header Files ########################### libc_support_library( @@ -3193,6 +3198,7 @@ libc_support_library( ":__support_common", ":__support_cpp_bitset", ":__support_macros_optimization", + ":llvm_libc_types_size_t", ":string_memory_utils", ], ) -- GitLab From df02bcc81d5099d60c2ec037edf8eaeb66456319 Mon Sep 17 00:00:00 2001 From: Krasimir Georgiev Date: Mon, 21 Oct 2024 13:39:25 +0200 Subject: [PATCH 221/511] bazelbuild: fix for commit d80b9cf713fd (#113153) Fix for https://github.com/llvm/llvm-project/commit/d80b9cf713fd1698641c5b265de6b66618991476. No functional changes intended. --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 37e165b78aa1..d52dd4870f16 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1080,6 +1080,7 @@ cc_library( ":ast_matchers", ":basic", ":lex", + ":support", "//llvm:Support", ], ) @@ -1232,6 +1233,7 @@ cc_library( ":ast", ":basic", ":lex", + ":support", "//llvm:Support", ], ) -- GitLab From c47df3e8c8f47bab8a8302757c50710e0e1c43fb Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 21 Oct 2024 11:49:45 +0000 Subject: [PATCH 222/511] [lldb][test] Make vector operator[] return T& to work around Arm codegen issue Since https://github.com/llvm/llvm-project/pull/109628 landed, this test has been failing on 32-bit Arm. This is due to a codegen problem (whether it was added or uncovered by the change is not known) where the trap instruction is placed after the frame pointer and link register are restored. https://github.com/llvm/llvm-project/issues/113154 So the code was: ``` std::__1::vector::operator[](unsigned int): sub sp, sp, #8 str r0, [sp, #4] str r1, [sp] add sp, sp, #8 .inst 0xe7ffdefe bx lr ``` When lldb saw the trap, the PC was inside operator[] but the frame information actually pointed to g. This bug only happens for leaf functions, so adding a return type works around it: ``` std::__1::vector::operator[](unsigned int): push {r11, lr} mov r11, sp sub sp, sp, #8 str r0, [sp, #4] str r1, [sp] mov sp, r11 pop {r11, lr} .inst 0xe7ffdefe bx lr ``` (and operator[] should return T& anyway) Now the PC location and frame information should match and the test passes. 
--- lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp index 4f01827944e1..20db722ef105 100644 --- a/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp +++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp @@ -1,7 +1,7 @@ namespace std { inline namespace __1 { template struct vector { - void operator[](unsigned) { __builtin_verbose_trap("Bounds error", "out-of-bounds access"); } + T& operator[](unsigned) { __builtin_verbose_trap("Bounds error", "out-of-bounds access"); } }; } // namespace __1 } // namespace std -- GitLab From 46dc91e7d9a1b6dd0144e628519d06954b7b4e53 Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Mon, 21 Oct 2024 08:20:22 -0400 Subject: [PATCH 223/511] [SystemZ][z/OS] Add new openFileForReadBinary function, and pass IsText parameter to getBufferForFile (#111723) This patch adds an IsText parameter to the following functions: getBufferForFile and getBufferForFileImpl. We introduce a new virtual function openFileForReadBinary, which defaults to openFileForRead except in RealFileSystem, which uses the OF_None flag instead of OF_Text. The default is set to OF_Text instead of OF_None; this change does not affect platforms other than z/OS. Setting this parameter correctly is required to open files on z/OS in the correct encoding. The IsText parameter is based on the context in which we open files; for example, in the ASTReader, HeaderMap requires that files always be opened in binary even though they might be tagged as text.
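A short usage sketch of the text/binary distinction (the file names are invented; the signatures match the interfaces added below):

```
#include "llvm/Support/VirtualFileSystem.h"
using namespace llvm;

void demo(vfs::FileSystem &FS) {
  // Text open: on z/OS this may apply encoding conversion based on file tags;
  // on other platforms it behaves exactly like a binary open.
  auto Src = FS.getBufferForFile("main.c", /*FileSize=*/-1,
                                 /*RequiresNullTerminator=*/true,
                                 /*IsVolatile=*/false, /*IsText=*/true);
  // Binary open: bytes are read as-is, e.g. for AST/PCH files or header maps.
  auto PCH = FS.openFileForReadBinary("prefix.pch");
}
```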
llvm::ErrorOr> getBufferForFile(FileEntryRef Entry, bool isVolatile = false, bool RequiresNullTerminator = true, - std::optional MaybeLimit = std::nullopt); + std::optional MaybeLimit = std::nullopt, + bool IsText = true); llvm::ErrorOr> getBufferForFile(StringRef Filename, bool isVolatile = false, bool RequiresNullTerminator = true, - std::optional MaybeLimit = std::nullopt) const { + std::optional MaybeLimit = std::nullopt, + bool IsText = true) const { return getBufferForFileImpl(Filename, /*FileSize=*/MaybeLimit.value_or(-1), - isVolatile, RequiresNullTerminator); + isVolatile, RequiresNullTerminator, IsText); } private: llvm::ErrorOr> getBufferForFileImpl(StringRef Filename, int64_t FileSize, bool isVolatile, - bool RequiresNullTerminator) const; + bool RequiresNullTerminator, bool IsText) const; DirectoryEntry *&getRealDirEntry(const llvm::vfs::Status &Status); diff --git a/clang/include/clang/Basic/FileSystemStatCache.h b/clang/include/clang/Basic/FileSystemStatCache.h index 5a003a748178..73c256a01697 100644 --- a/clang/include/clang/Basic/FileSystemStatCache.h +++ b/clang/include/clang/Basic/FileSystemStatCache.h @@ -48,10 +48,10 @@ public: /// success for directories (not files). On a successful file lookup, the /// implementation can optionally fill in \p F with a valid \p File object and /// the client guarantees that it will close it. - static std::error_code - get(StringRef Path, llvm::vfs::Status &Status, bool isFile, - std::unique_ptr *F, - FileSystemStatCache *Cache, llvm::vfs::FileSystem &FS); + static std::error_code get(StringRef Path, llvm::vfs::Status &Status, + bool isFile, std::unique_ptr *F, + FileSystemStatCache *Cache, + llvm::vfs::FileSystem &FS, bool IsText = true); protected: // FIXME: The pointer here is a non-owning/optional reference to the diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 6097b85a0306..2876c290a26b 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -212,8 +212,10 @@ FileManager::getFile(StringRef Filename, bool openFile, bool CacheFailure) { return llvm::errorToErrorCode(Result.takeError()); } -llvm::Expected -FileManager::getFileRef(StringRef Filename, bool openFile, bool CacheFailure) { +llvm::Expected FileManager::getFileRef(StringRef Filename, + bool openFile, + bool CacheFailure, + bool IsText) { ++NumFileLookups; // See if there is already an entry in the map. @@ -259,7 +261,7 @@ FileManager::getFileRef(StringRef Filename, bool openFile, bool CacheFailure) { std::unique_ptr F; llvm::vfs::Status Status; auto statError = getStatValue(InterndFileName, Status, true, - openFile ? &F : nullptr); + openFile ? &F : nullptr, IsText); if (statError) { // There's no real file at the given path. if (CacheFailure) @@ -531,7 +533,7 @@ void FileManager::fillRealPathName(FileEntry *UFE, llvm::StringRef FileName) { llvm::ErrorOr> FileManager::getBufferForFile(FileEntryRef FE, bool isVolatile, bool RequiresNullTerminator, - std::optional MaybeLimit) { + std::optional MaybeLimit, bool IsText) { const FileEntry *Entry = &FE.getFileEntry(); // If the content is living on the file entry, return a reference to it. if (Entry->Content) @@ -558,21 +560,21 @@ FileManager::getBufferForFile(FileEntryRef FE, bool isVolatile, // Otherwise, open the file. 
return getBufferForFileImpl(Filename, FileSize, isVolatile, - RequiresNullTerminator); + RequiresNullTerminator, IsText); } llvm::ErrorOr> FileManager::getBufferForFileImpl(StringRef Filename, int64_t FileSize, - bool isVolatile, - bool RequiresNullTerminator) const { + bool isVolatile, bool RequiresNullTerminator, + bool IsText) const { if (FileSystemOpts.WorkingDir.empty()) return FS->getBufferForFile(Filename, FileSize, RequiresNullTerminator, - isVolatile); + isVolatile, IsText); SmallString<128> FilePath(Filename); FixupRelativePath(FilePath); return FS->getBufferForFile(FilePath, FileSize, RequiresNullTerminator, - isVolatile); + isVolatile, IsText); } /// getStatValue - Get the 'stat' information for the specified path, @@ -580,20 +582,22 @@ FileManager::getBufferForFileImpl(StringRef Filename, int64_t FileSize, /// if the path points to a virtual file or does not exist, or returns /// false if it's an existent real file. If FileDescriptor is NULL, /// do directory look-up instead of file look-up. -std::error_code -FileManager::getStatValue(StringRef Path, llvm::vfs::Status &Status, - bool isFile, std::unique_ptr *F) { +std::error_code FileManager::getStatValue(StringRef Path, + llvm::vfs::Status &Status, + bool isFile, + std::unique_ptr *F, + bool IsText) { // FIXME: FileSystemOpts shouldn't be passed in here, all paths should be // absolute! if (FileSystemOpts.WorkingDir.empty()) - return FileSystemStatCache::get(Path, Status, isFile, F, - StatCache.get(), *FS); + return FileSystemStatCache::get(Path, Status, isFile, F, StatCache.get(), + *FS, IsText); SmallString<128> FilePath(Path); FixupRelativePath(FilePath); return FileSystemStatCache::get(FilePath.c_str(), Status, isFile, F, - StatCache.get(), *FS); + StatCache.get(), *FS, IsText); } std::error_code diff --git a/clang/lib/Basic/FileSystemStatCache.cpp b/clang/lib/Basic/FileSystemStatCache.cpp index 415a4e2025df..183eea098663 100644 --- a/clang/lib/Basic/FileSystemStatCache.cpp +++ b/clang/lib/Basic/FileSystemStatCache.cpp @@ -30,11 +30,12 @@ void FileSystemStatCache::anchor() {} /// success for directories (not files). On a successful file lookup, the /// implementation can optionally fill in FileDescriptor with a valid /// descriptor and the client guarantees that it will close it. -std::error_code -FileSystemStatCache::get(StringRef Path, llvm::vfs::Status &Status, - bool isFile, std::unique_ptr *F, - FileSystemStatCache *Cache, - llvm::vfs::FileSystem &FS) { +std::error_code FileSystemStatCache::get(StringRef Path, + llvm::vfs::Status &Status, bool isFile, + std::unique_ptr *F, + FileSystemStatCache *Cache, + llvm::vfs::FileSystem &FS, + bool IsText) { bool isForDir = !isFile; std::error_code RetCode; @@ -58,7 +59,8 @@ FileSystemStatCache::get(StringRef Path, llvm::vfs::Status &Status, // // Because of this, check to see if the file exists with 'open'. If the // open succeeds, use fstat to get the stat info. - auto OwnedFile = FS.openFileForRead(Path); + auto OwnedFile = + IsText ? FS.openFileForRead(Path) : FS.openFileForReadBinary(Path); if (!OwnedFile) { // If the open fails, our "stat" fails. 
diff --git a/clang/lib/Lex/HeaderMap.cpp b/clang/lib/Lex/HeaderMap.cpp index 00bf880726ee..b04f67a4b2ed 100644 --- a/clang/lib/Lex/HeaderMap.cpp +++ b/clang/lib/Lex/HeaderMap.cpp @@ -54,7 +54,10 @@ std::unique_ptr HeaderMap::Create(FileEntryRef FE, FileManager &FM) { unsigned FileSize = FE.getSize(); if (FileSize <= sizeof(HMapHeader)) return nullptr; - auto FileBuffer = FM.getBufferForFile(FE); + auto FileBuffer = + FM.getBufferForFile(FE, /*IsVolatile=*/false, + /*RequiresNullTerminator=*/true, + /*MaybeList=*/std::nullopt, /*IsText=*/false); if (!FileBuffer || !*FileBuffer) return nullptr; bool NeedsByteSwap; diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 4e77df9ec444..8e7d80aa8911 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -1080,8 +1080,8 @@ Preprocessor::LookupEmbedFile(StringRef Filename, bool isAngled, bool OpenFile, FileManager &FM = this->getFileManager(); if (llvm::sys::path::is_absolute(Filename)) { // lookup path or immediately fail - llvm::Expected ShouldBeEntry = - FM.getFileRef(Filename, OpenFile); + llvm::Expected ShouldBeEntry = FM.getFileRef( + Filename, OpenFile, /*CacheFailure=*/true, /*IsText=*/false); return llvm::expectedToOptional(std::move(ShouldBeEntry)); } @@ -1107,8 +1107,8 @@ Preprocessor::LookupEmbedFile(StringRef Filename, bool isAngled, bool OpenFile, StringRef FullFileDir = LookupFromFile->tryGetRealPathName(); if (!FullFileDir.empty()) { SeparateComponents(LookupPath, FullFileDir, Filename, true); - llvm::Expected ShouldBeEntry = - FM.getFileRef(LookupPath, OpenFile); + llvm::Expected ShouldBeEntry = FM.getFileRef( + LookupPath, OpenFile, /*CacheFailure=*/true, /*IsText=*/false); if (ShouldBeEntry) return llvm::expectedToOptional(std::move(ShouldBeEntry)); llvm::consumeError(ShouldBeEntry.takeError()); @@ -1123,8 +1123,8 @@ Preprocessor::LookupEmbedFile(StringRef Filename, bool isAngled, bool OpenFile, StringRef WorkingDir = WorkingDirEntry.getName(); if (!WorkingDir.empty()) { SeparateComponents(LookupPath, WorkingDir, Filename, false); - llvm::Expected ShouldBeEntry = - FM.getFileRef(LookupPath, OpenFile); + llvm::Expected ShouldBeEntry = FM.getFileRef( + LookupPath, OpenFile, /*CacheFailure=*/true, /*IsText=*/false); if (ShouldBeEntry) return llvm::expectedToOptional(std::move(ShouldBeEntry)); llvm::consumeError(ShouldBeEntry.takeError()); @@ -1135,8 +1135,8 @@ Preprocessor::LookupEmbedFile(StringRef Filename, bool isAngled, bool OpenFile, for (const auto &Entry : PPOpts->EmbedEntries) { LookupPath.clear(); SeparateComponents(LookupPath, Entry, Filename, false); - llvm::Expected ShouldBeEntry = - FM.getFileRef(LookupPath, OpenFile); + llvm::Expected ShouldBeEntry = FM.getFileRef( + LookupPath, OpenFile, /*CacheFailure=*/true, /*IsText=*/false); if (ShouldBeEntry) return llvm::expectedToOptional(std::move(ShouldBeEntry)); llvm::consumeError(ShouldBeEntry.takeError()); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 1cf6c9352f36..60b708067dc5 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -5334,7 +5334,9 @@ std::string ASTReader::getOriginalSourceFile( const PCHContainerReader &PCHContainerRdr, DiagnosticsEngine &Diags) { // Open the AST file. 
auto Buffer = FileMgr.getBufferForFile(ASTFileName, /*IsVolatile=*/false, - /*RequiresNullTerminator=*/false); + /*RequiresNullTerminator=*/false, + /*MaybeLimit=*/std::nullopt, + /*IsText=*/false); if (!Buffer) { Diags.Report(diag::err_fe_unable_to_read_pch_file) << ASTFileName << Buffer.getError().message(); diff --git a/clang/test/Preprocessor/embed_zos.c b/clang/test/Preprocessor/embed_zos.c new file mode 100644 index 000000000000..564a65f42afc --- /dev/null +++ b/clang/test/Preprocessor/embed_zos.c @@ -0,0 +1,109 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t/media && cp %S/Inputs/media/art.txt %t/media/ +// RUN: chtag -r %t/media/art.txt +// RUN: %clang_cc1 -std=c23 %s -fsyntax-only --embed-dir=%t -verify +// expected-no-diagnostics + +// REQUIRES: shell, system-zos + +const char data[] = { +#embed +}; +const char data2[] = { +#embed +, 0 +}; +const char data3[] = { +#embed suffix(, 0) +}; +const char data4[] = { +#embed suffix(,) +0 +}; +static_assert(sizeof(data) == 274); +static_assert(' ' == data[0]); +static_assert('_' == data[11]); +static_assert('\n' == data[273]); +static_assert(sizeof(data2) == 275); +static_assert(' ' == data2[0]); +static_assert('_' == data2[11]); +static_assert('\n' == data2[273]); +static_assert('\0' == data2[274]); +static_assert(sizeof(data3) == 275); +static_assert(' ' == data3[0]); +static_assert('_' == data3[11]); +static_assert('\n' == data3[273]); +static_assert('\0' == data3[274]); +static_assert(sizeof(data4) == 275); +static_assert(' ' == data4[0]); +static_assert('_' == data4[11]); +static_assert('\n' == data4[273]); +static_assert('\0' == data4[274]); + +const signed char data5[] = { +#embed +}; +const signed char data6[] = { +#embed +, 0 +}; +const signed char data7[] = { +#embed suffix(, 0) +}; +const signed char data8[] = { +#embed suffix(,) +0 +}; +static_assert(sizeof(data5) == 274); +static_assert(' ' == data5[0]); +static_assert('_' == data5[11]); +static_assert('\n' == data5[273]); +static_assert(sizeof(data6) == 275); +static_assert(' ' == data6[0]); +static_assert('_' == data6[11]); +static_assert('\n' == data6[273]); +static_assert('\0' == data6[274]); +static_assert(sizeof(data7) == 275); +static_assert(' ' == data7[0]); +static_assert('_' == data7[11]); +static_assert('\n' == data7[273]); +static_assert('\0' == data7[274]); +static_assert(sizeof(data8) == 275); +static_assert(' ' == data8[0]); +static_assert('_' == data8[11]); +static_assert('\n' == data8[273]); +static_assert('\0' == data8[274]); + +const unsigned char data9[] = { +#embed +}; +const unsigned char data10[] = { +0, +#embed +}; +const unsigned char data11[] = { +#embed prefix(0,) +}; +const unsigned char data12[] = { +0 +#embed prefix(,) +}; +static_assert(sizeof(data9) == 274); +static_assert(' ' == data9[0]); +static_assert('_' == data9[11]); +static_assert('\n' == data9[273]); +static_assert(sizeof(data10) == 275); +static_assert(' ' == data10[1]); +static_assert('_' == data10[12]); +static_assert('\n' == data10[274]); +static_assert('\0' == data10[0]); +static_assert(sizeof(data11) == 275); +static_assert(' ' == data11[1]); +static_assert('_' == data11[12]); +static_assert('\n' == data11[274]); +static_assert('\0' == data11[0]); +static_assert(sizeof(data12) == 275); +static_assert(' ' == data12[1]); +static_assert('_' == data12[12]); +static_assert('\n' == data12[274]); +static_assert('\0' == data12[0]); diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index 2531c075f262..1358e880942a 100644 --- 
a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -271,15 +271,28 @@ public: /// Get the status of the entry at \p Path, if one exists. virtual llvm::ErrorOr status(const Twine &Path) = 0; - /// Get a \p File object for the file at \p Path, if one exists. + /// Get a \p File object for the text file at \p Path, if one exists. virtual llvm::ErrorOr> openFileForRead(const Twine &Path) = 0; + /// Get a \p File object for the binary file at \p Path, if one exists. + /// Some non-ascii based file systems perform encoding conversions + /// when reading as a text file, and this function should be used if + /// a file's bytes should be read as-is. On most filesystems, this + /// is the same behaviour as openFileForRead. + virtual llvm::ErrorOr> + openFileForReadBinary(const Twine &Path) { + return openFileForRead(Path); + } + /// This is a convenience method that opens a file, gets its content and then /// closes the file. + /// The IsText parameter is used to distinguish whether the file should be + /// opened as a binary or text file. llvm::ErrorOr> getBufferForFile(const Twine &Name, int64_t FileSize = -1, - bool RequiresNullTerminator = true, bool IsVolatile = false); + bool RequiresNullTerminator = true, bool IsVolatile = false, + bool IsText = true); /// Get a directory_iterator for \p Dir. /// \note The 'end' iterator is directory_iterator(). diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 3e79ecf2fc7e..b3cdaa3eefc9 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -117,8 +117,9 @@ FileSystem::~FileSystem() = default; ErrorOr> FileSystem::getBufferForFile(const llvm::Twine &Name, int64_t FileSize, - bool RequiresNullTerminator, bool IsVolatile) { - auto F = openFileForRead(Name); + bool RequiresNullTerminator, bool IsVolatile, + bool IsText) { + auto F = IsText ? openFileForRead(Name) : openFileForReadBinary(Name); if (!F) return F.getError(); @@ -279,6 +280,8 @@ public: ErrorOr status(const Twine &Path) override; ErrorOr> openFileForRead(const Twine &Path) override; + ErrorOr> + openFileForReadBinary(const Twine &Path) override; directory_iterator dir_begin(const Twine &Dir, std::error_code &EC) override; llvm::ErrorOr getCurrentWorkingDirectory() const override; @@ -302,6 +305,17 @@ private: return Storage; } + ErrorOr> + openFileForReadWithFlags(const Twine &Name, sys::fs::OpenFlags Flags) { + SmallString<256> RealName, Storage; + Expected FDOrErr = sys::fs::openNativeFileForRead( + adjustPath(Name, Storage), Flags, &RealName); + if (!FDOrErr) + return errorToErrorCode(FDOrErr.takeError()); + return std::unique_ptr( + new RealFile(*FDOrErr, Name.str(), RealName.str())); + } + struct WorkingDirectory { // The current working directory, without symlinks resolved. (echo $PWD). 
SmallString<128> Specified; @@ -324,13 +338,12 @@ ErrorOr RealFileSystem::status(const Twine &Path) { ErrorOr> RealFileSystem::openFileForRead(const Twine &Name) { - SmallString<256> RealName, Storage; - Expected FDOrErr = sys::fs::openNativeFileForRead( - adjustPath(Name, Storage), sys::fs::OF_None, &RealName); - if (!FDOrErr) - return errorToErrorCode(FDOrErr.takeError()); - return std::unique_ptr( - new RealFile(*FDOrErr, Name.str(), RealName.str())); + return openFileForReadWithFlags(Name, sys::fs::OF_Text); +} + +ErrorOr> +RealFileSystem::openFileForReadBinary(const Twine &Name) { + return openFileForReadWithFlags(Name, sys::fs::OF_None); } llvm::ErrorOr RealFileSystem::getCurrentWorkingDirectory() const { -- GitLab From 17ac10c28f0a3c078a82595787da7d855e581bf1 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 21 Oct 2024 13:37:44 +0100 Subject: [PATCH 224/511] Revert "[SLP]Initial non-power-of-2 support (but still whole register) for reductions" This reverts commit 7f2e937469a8cec3fe977bf41ad2dfb9b4ce648a as it causes regressions in the tests it modifies, and undoes what was added in #100653 (which itself was a fix for a previous regression). --- .../Transforms/Vectorize/SLPVectorizer.cpp | 495 +++++++----------- .../PhaseOrdering/AArch64/slpordering.ll | 42 +- .../SLPVectorizer/AArch64/loadorder.ll | 34 +- .../SLPVectorizer/AArch64/tsc-s116.ll | 11 +- .../SLPVectorizer/AArch64/vec3-calls.ll | 3 +- .../X86/gather-node-same-as-vect-but-order.ll | 15 +- .../SLPVectorizer/X86/horizontal-list.ll | 32 +- .../SLPVectorizer/X86/horizontal-minmax.ll | 10 +- .../X86/non-power-of-2-order-detection.ll | 9 +- .../X86/reorder_with_external_users.ll | 191 +++++++ .../SLPVectorizer/X86/vec3-calls.ll | 3 +- .../X86/vect-gather-same-nodes.ll | 14 +- 12 files changed, 468 insertions(+), 391 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e1aa6127ac03..1098bf578d2d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -291,8 +291,6 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, if (NumParts == 0 || NumParts >= Sz) return bit_floor(Sz); unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts)); - if (RegVF > Sz) - return bit_floor(Sz); return (Sz / RegVF) * RegVF; } @@ -1507,12 +1505,6 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const; - /// Checks if the graph and all its subgraphs cannot be better vectorized. - /// It may happen, if all gather nodes are loads and they cannot be - /// "clusterized". In this case even subgraphs cannot be vectorized more - /// effectively than the base graph. - bool isTreeNotExtendable() const; - /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values /// can be load combined in the backend. Load combining may not be allowed in /// the IR optimizer, so we do not want to alter the pattern. For example, @@ -3055,9 +3047,7 @@ private: /// vector loads/masked gathers instead of regular gathers. Later these loads /// are reshufled to build final gathered nodes. void tryToVectorizeGatheredLoads( - const SmallMapVector, - SmallVector>>, - 8> &GatheredLoads); + ArrayRef>> GatheredLoads); /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. @@ -3069,7 +3059,7 @@ private: /// Helper for `findExternalStoreUsersReorderIndices()`. 
From 17ac10c28f0a3c078a82595787da7d855e581bf1 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 21 Oct 2024 13:37:44 +0100
Subject: [PATCH 224/511] Revert "[SLP]Initial non-power-of-2 support (but
 still whole register) for reductions"

This reverts commit 7f2e937469a8cec3fe977bf41ad2dfb9b4ce648a as it causes
regressions in the tests it modifies, and undoes what was added in #100653
(which itself was a fix for a previous regression).

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 495 +++++++-----------
 .../PhaseOrdering/AArch64/slpordering.ll      |  42 +-
 .../SLPVectorizer/AArch64/loadorder.ll        |  34 +-
 .../SLPVectorizer/AArch64/tsc-s116.ll         |  11 +-
 .../SLPVectorizer/AArch64/vec3-calls.ll       |   3 +-
 .../X86/gather-node-same-as-vect-but-order.ll |  15 +-
 .../SLPVectorizer/X86/horizontal-list.ll      |  32 +-
 .../SLPVectorizer/X86/horizontal-minmax.ll    |  10 +-
 .../X86/non-power-of-2-order-detection.ll     |   9 +-
 .../X86/reorder_with_external_users.ll        | 191 +++++++
 .../SLPVectorizer/X86/vec3-calls.ll           |   3 +-
 .../X86/vect-gather-same-nodes.ll             |  14 +-
 12 files changed, 468 insertions(+), 391 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1aa6127ac03..1098bf578d2d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -291,8 +291,6 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
   if (NumParts == 0 || NumParts >= Sz)
     return bit_floor(Sz);
   unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
-  if (RegVF > Sz)
-    return bit_floor(Sz);
   return (Sz / RegVF) * RegVF;
 }
 
@@ -1507,12 +1505,6 @@ public:
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
 
-  /// Checks if the graph and all its subgraphs cannot be better vectorized.
-  /// It may happen, if all gather nodes are loads and they cannot be
-  /// "clusterized". In this case even subgraphs cannot be vectorized more
-  /// effectively than the base graph.
-  bool isTreeNotExtendable() const;
-
   /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
   /// can be load combined in the backend. Load combining may not be allowed in
   /// the IR optimizer, so we do not want to alter the pattern. For example,
@@ -3055,9 +3047,7 @@ private:
   /// vector loads/masked gathers instead of regular gathers. Later these loads
   /// are reshuffled to build final gathered nodes.
   void tryToVectorizeGatheredLoads(
-      const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
-                           SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
-                           8> &GatheredLoads);
+      ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads);
 
   /// Reorder commutative or alt operands to get better probability of
   /// generating vectorized code.
@@ -3069,7 +3059,7 @@ private:
   /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
   /// users of \p TE and collects the stores. It returns the map from the store
   /// pointers to the collected stores.
-  SmallVector<SmallVector<StoreInst *>>
+  DenseMap<Value *, SmallVector<StoreInst *>>
   collectUserStores(const BoUpSLP::TreeEntry *TE) const;
 
   /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
@@ -4667,8 +4657,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                   const TargetLibraryInfo &TLI,
                                   bool CompareOpcodes = true) {
-  if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
-      getUnderlyingObject(Ptr2, RecursionMaxDepth))
+  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
     return false;
   auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
   auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
@@ -5188,40 +5177,30 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   return LoadsState::Gather;
 }
 
-static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
-                                   ArrayRef<BasicBlock *> BBs, Type *ElemTy,
+static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
                                    const DataLayout &DL, ScalarEvolution &SE,
                                    SmallVectorImpl<unsigned> &SortedIndices) {
-  assert(
-      all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
-      "Expected list of pointer operands.");
+  assert(llvm::all_of(
+             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+         "Expected list of pointer operands.");
   // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
   // Ptr into, sort and return the sorted indices with values next to one
   // another.
-  SmallMapVector<std::pair<BasicBlock *, Value *>,
-                 SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>,
-                 8>
-      Bases;
-  Bases
-      .try_emplace(std::make_pair(
-          BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
-      .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
-
-  SortedIndices.clear();
-  for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
-    auto Key = std::make_pair(BBs[Cnt + 1],
-                              getUnderlyingObject(Ptr, RecursionMaxDepth));
-    bool Found = any_of(Bases.try_emplace(Key).first->second,
-                        [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
-                          std::optional<int> Diff = getPointersDiff(
-                              ElemTy, std::get<0>(Base.front()), ElemTy,
-                              Ptr, DL, SE,
-                              /*StrictCheck=*/true);
-                          if (!Diff)
-                            return false;
+  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
+  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
+
+  unsigned Cnt = 1;
+  for (Value *Ptr : VL.drop_front()) {
+    bool Found = any_of(Bases, [&](auto &Base) {
+      std::optional<int> Diff =
+          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
+                          /*StrictCheck=*/true);
+      if (!Diff)
+        return false;
 
-                          Base.emplace_back(Ptr, *Diff, Cnt + 1);
-                          return true;
-                        });
+      Base.second.emplace_back(Ptr, *Diff, Cnt++);
+      return true;
+    });
 
     if (!Found) {
       // If we haven't found enough to usefully cluster, return early.
@@ -5229,39 +5208,71 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
         return false;
 
       // Not found already - add a new Base
-      Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
+      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
     }
   }
 
   // For each of the bases sort the pointers by Offset and check if any of the
   // base become consecutively allocated.
+  bool AnyConsecutive = false;
   for (auto &Base : Bases) {
-    for (auto &Vec : Base.second) {
-      if (Vec.size() > 1) {
-        stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
-                            const std::tuple<Value *, int, unsigned> &Y) {
-          return std::get<1>(X) < std::get<1>(Y);
-        });
-        int InitialOffset = std::get<1>(Vec[0]);
-        bool AnyConsecutive =
-            all_of(enumerate(Vec), [InitialOffset](const auto &P) {
-              return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
-            });
-        // Fill SortedIndices array only if it looks worth-while to sort the
-        // ptrs.
-        if (!AnyConsecutive)
-          return false;
-      }
+    auto &Vec = Base.second;
+    if (Vec.size() > 1) {
+      llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
+                                const std::tuple<Value *, int, unsigned> &Y) {
+        return std::get<1>(X) < std::get<1>(Y);
+      });
+      int InitialOffset = std::get<1>(Vec[0]);
+      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
+        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
+      });
     }
-    sort(Base.second, [](const auto &V1, const auto &V2) {
-      return std::get<2>(V1.front()) < std::get<2>(V2.front());
+  }
+
+  // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
+  SortedIndices.clear();
+  if (!AnyConsecutive)
+    return false;
+
+  // If we have a better order, also sort the base pointers by increasing
+  // (variable) values if possible, to try and keep the order more regular. In
+  // order to create a valid strict-weak order we cluster by the Root of gep
+  // chains and sort within each.
+  SmallVector<std::tuple<Value *, Value *, Value *>> SortedBases;
+  for (auto &Base : Bases) {
+    Value *Strip = Base.first->stripInBoundsConstantOffsets();
+    Value *Root = Strip;
+    while (auto *Gep = dyn_cast<GetElementPtrInst>(Root))
+      Root = Gep->getOperand(0);
+    SortedBases.emplace_back(Base.first, Strip, Root);
+  }
+  auto *Begin = SortedBases.begin();
+  auto *End = SortedBases.end();
+  while (Begin != End) {
+    Value *Root = std::get<2>(*Begin);
+    auto *Mid = std::stable_partition(
+        Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; });
+    DenseMap<Value *, DenseMap<Value *, bool>> LessThan;
+    for (auto *I = Begin; I < Mid; ++I)
+      LessThan.try_emplace(std::get<1>(*I));
+    for (auto *I = Begin; I < Mid; ++I) {
+      Value *V = std::get<1>(*I);
+      while (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
+        V = Gep->getOperand(0);
+        if (LessThan.contains(V))
+          LessThan[V][std::get<1>(*I)] = true;
+      }
+    }
+    std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) {
+      return LessThan[std::get<1>(V1)][std::get<1>(V2)];
     });
+    Begin = Mid;
   }
-  for (auto &T : Bases)
-    for (const auto &Vec : T.second)
-      for (const auto &P : Vec)
-        SortedIndices.push_back(std::get<2>(P));
+  // Collect the final order of sorted indices
+  for (auto Base : SortedBases)
+    for (auto &T : Bases[std::get<0>(Base)])
+      SortedIndices.push_back(std::get<2>(T));
 
   assert(SortedIndices.size() == VL.size() &&
          "Expected SortedIndices to be the size of VL");
@@ -5275,18 +5286,15 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
 
   SmallVector<Value *> Ptrs;
   Ptrs.reserve(TE.Scalars.size());
-  SmallVector<BasicBlock *> BBs;
-  BBs.reserve(TE.Scalars.size());
   for (Value *V : TE.Scalars) {
     auto *L = dyn_cast<LoadInst>(V);
     if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
-    BBs.push_back(L->getParent());
  }
 
   BoUpSLP::OrdersType Order;
-  if (clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
+  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
     return std::move(Order);
   return std::nullopt;
 }
@@ -5654,7 +5662,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
   }
   // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
   // has been audited for correctness with non-power-of-two vectors.
- if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) + if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) return CurrentOrder; } @@ -6385,15 +6393,13 @@ void BoUpSLP::buildExternalUses( } } -SmallVector> +DenseMap> BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { - SmallDenseMap, - SmallVector, 8> - PtrToStoresMap; + DenseMap> PtrToStoresMap; for (unsigned Lane : seq(0, TE->Scalars.size())) { Value *V = TE->Scalars[Lane]; // Don't iterate over the users of constant data. - if (!isa(V)) + if (isa(V)) continue; // To save compilation time we don't visit if we have too many users. if (V->hasNUsesOrMore(UsesLimit)) @@ -6411,34 +6417,25 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { if (getTreeEntry(U)) continue; - Value *Ptr = - getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth); - auto &StoresVec = PtrToStoresMap[{SI->getParent(), - SI->getValueOperand()->getType(), Ptr}]; + Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); + auto &StoresVec = PtrToStoresMap[Ptr]; // For now just keep one store per pointer object per lane. // TODO: Extend this to support multiple stores per pointer per lane if (StoresVec.size() > Lane) continue; - if (!StoresVec.empty()) { - std::optional Diff = getPointersDiff( - SI->getValueOperand()->getType(), SI->getPointerOperand(), - SI->getValueOperand()->getType(), - StoresVec.front()->getPointerOperand(), *DL, *SE, - /*StrictCheck=*/true); - // We failed to compare the pointers so just abandon this store. - if (!Diff) - continue; - } + // Skip if in different BBs. + if (!StoresVec.empty() && + SI->getParent() != StoresVec.back()->getParent()) + continue; + // Make sure that the stores are of the same type. + if (!StoresVec.empty() && + SI->getValueOperand()->getType() != + StoresVec.back()->getValueOperand()->getType()) + continue; StoresVec.push_back(SI); } } - SmallVector> Res(PtrToStoresMap.size()); - unsigned I = 0; - for (auto &P : PtrToStoresMap) { - Res[I].swap(P.second); - ++I; - } - return Res; + return PtrToStoresMap; } bool BoUpSLP::canFormVector(ArrayRef StoresVec, @@ -6448,9 +6445,9 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, // To avoid calling getPointersDiff() while sorting we create a vector of // pairs {store, offset from first} and sort this instead. - SmallVector> StoreOffsetVec; + SmallVector> StoreOffsetVec(StoresVec.size()); StoreInst *S0 = StoresVec[0]; - StoreOffsetVec.emplace_back(0, 0); + StoreOffsetVec[0] = {S0, 0}; Type *S0Ty = S0->getValueOperand()->getType(); Value *S0Ptr = S0->getPointerOperand(); for (unsigned Idx : seq(1, StoresVec.size())) { @@ -6459,36 +6456,41 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); - StoreOffsetVec.emplace_back(*Diff, Idx); + // We failed to compare the pointers so just abandon this StoresVec. + if (!Diff) + return false; + StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff}; } + // Sort the vector based on the pointers. We create a copy because we may + // need the original later for calculating the reorder (shuffle) indices. + stable_sort(StoreOffsetVec, [](const std::pair &Pair1, + const std::pair &Pair2) { + int Offset1 = Pair1.second; + int Offset2 = Pair2.second; + return Offset1 < Offset2; + }); + // Check if the stores are consecutive by checking if their difference is 1. 
- if (StoreOffsetVec.size() != StoresVec.size()) - return false; - sort(StoreOffsetVec, - [](const std::pair &L, - const std::pair &R) { return L.first < R.first; }); - unsigned Idx = 0; - int PrevDist = 0; - for (const auto &P : StoreOffsetVec) { - if (Idx > 0 && P.first != PrevDist + 1) + for (unsigned Idx : seq(1, StoreOffsetVec.size())) + if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1) return false; - PrevDist = P.first; - ++Idx; - } // Calculate the shuffle indices according to their offset against the sorted // StoreOffsetVec. - ReorderIndices.assign(StoresVec.size(), 0); - bool IsIdentity = true; - for (auto [I, P] : enumerate(StoreOffsetVec)) { - ReorderIndices[P.second] = I; - IsIdentity &= P.second == I; + ReorderIndices.reserve(StoresVec.size()); + for (StoreInst *SI : StoresVec) { + unsigned Idx = find_if(StoreOffsetVec, + [SI](const std::pair &Pair) { + return Pair.first == SI; + }) - + StoreOffsetVec.begin(); + ReorderIndices.push_back(Idx); } // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in // reorderTopToBottom() and reorderBottomToTop(), so we are following the // same convention here. - if (IsIdentity) + if (isIdentityOrder(ReorderIndices)) ReorderIndices.clear(); return true; @@ -6506,7 +6508,8 @@ SmallVector BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { unsigned NumLanes = TE->Scalars.size(); - SmallVector> Stores = collectUserStores(TE); + DenseMap> PtrToStoresMap = + collectUserStores(TE); // Holds the reorder indices for each candidate store vector that is a user of // the current TreeEntry. @@ -6515,7 +6518,8 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { // Now inspect the stores collected per pointer and look for vectorization // candidates. For each candidate calculate the reorder index vector and push // it into `ExternalReorderIndices` - for (ArrayRef StoresVec : Stores) { + for (const auto &Pair : PtrToStoresMap) { + auto &StoresVec = Pair.second; // If we have fewer than NumLanes stores, then we can't form a vector. 
if (StoresVec.size() != NumLanes) continue; @@ -6570,13 +6574,9 @@ static void gatherPossiblyVectorizableLoads( continue; bool IsFound = false; for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) { - assert(LI->getParent() == Data.front().first->getParent() && - LI->getType() == Data.front().first->getType() && - getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) == - getUnderlyingObject(Data.front().first->getPointerOperand(), - RecursionMaxDepth) && - "Expected loads with the same type, same parent and same " - "underlying pointer."); + if (LI->getParent() != Data.front().first->getParent() || + LI->getType() != Data.front().first->getType()) + continue; std::optional Dist = getPointersDiff( LI->getType(), LI->getPointerOperand(), Data.front().first->getType(), Data.front().first->getPointerOperand(), DL, SE, @@ -6704,9 +6704,7 @@ static void gatherPossiblyVectorizableLoads( } void BoUpSLP::tryToVectorizeGatheredLoads( - const SmallMapVector, - SmallVector>>, - 8> &GatheredLoads) { + ArrayRef>> GatheredLoads) { GatheredLoadsEntriesFirst = VectorizableTree.size(); SmallVector> LoadSetsToVectorize( @@ -6739,10 +6737,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads( SmallVector CandidateVFs; if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1)) CandidateVFs.push_back(MaxVF); - for (int NumElts = getFloorFullVectorNumberOfElements( - *TTI, Loads.front()->getType(), MaxVF); - NumElts > 1; NumElts = getFloorFullVectorNumberOfElements( - *TTI, Loads.front()->getType(), NumElts - 1)) { + for (int NumElts = bit_floor(MaxVF); NumElts > 1; NumElts /= 2) { CandidateVFs.push_back(NumElts); if (VectorizeNonPowerOf2 && NumElts > 2) CandidateVFs.push_back(NumElts - 1); @@ -6756,10 +6751,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads( if (Final && NumElts > BestVF) continue; SmallVector MaskedGatherVectorized; - for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; + for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E; ++Cnt) { - ArrayRef Slice = - ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt)); + ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); if (VectorizedLoads.count(Slice.front()) || VectorizedLoads.count(Slice.back()) || areKnownNonVectorizableLoads(Slice)) @@ -7105,27 +7099,24 @@ void BoUpSLP::tryToVectorizeGatheredLoads( } return NonVectorized; }; - for (const auto &GLs : GatheredLoads) { - const auto &Ref = GLs.second; - SmallVector NonVectorized = ProcessGatheredLoads(Ref); - if (!Ref.empty() && !NonVectorized.empty() && - std::accumulate( - Ref.begin(), Ref.end(), 0u, - [](unsigned S, ArrayRef> LoadsDists) { - return S + LoadsDists.size(); - }) != NonVectorized.size() && - IsMaskedGatherSupported(NonVectorized)) { - SmallVector>> FinalGatheredLoads; - for (LoadInst *LI : NonVectorized) { - // Reinsert non-vectorized loads to other list of loads with the same - // base pointers. - gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, - FinalGatheredLoads, - /*AddNew=*/false); - } - // Final attempt to vectorize non-vectorized loads. 
- (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true); - } + SmallVector NonVectorized = ProcessGatheredLoads(GatheredLoads); + if (!GatheredLoads.empty() && !NonVectorized.empty() && + std::accumulate( + GatheredLoads.begin(), GatheredLoads.end(), 0u, + [](unsigned S, ArrayRef> LoadsDists) { + return S + LoadsDists.size(); + }) != NonVectorized.size() && + IsMaskedGatherSupported(NonVectorized)) { + SmallVector>> FinalGatheredLoads; + for (LoadInst *LI : NonVectorized) { + // Reinsert non-vectorized loads to other list of loads with the same + // base pointers. + gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, + FinalGatheredLoads, + /*AddNew=*/false); + } + // Final attempt to vectorize non-vectorized loads. + (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true); } // Try to vectorize postponed load entries, previously marked as gathered. for (unsigned Idx : LoadEntriesToVectorize) { @@ -7372,6 +7363,13 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) { assert(S.MainOp && "Expected instructions with same/alternate opcodes only."); + if (S.MainOp->getType()->isFloatingPointTy() && + TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { + auto *I = dyn_cast(V); + return I && (I->isBinaryOp() || isa(I)) && !I->isFast(); + })) + return TreeEntry::NeedToGather; + unsigned ShuffleOrOp = S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); auto *VL0 = cast(S.OpValue); @@ -7536,12 +7534,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case Instruction::Or: case Instruction::Xor: case Instruction::Freeze: - if (S.MainOp->getType()->isFloatingPointTy() && - TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { - auto *I = dyn_cast(V); - return I && I->isBinaryOp() && !I->isFast(); - })) - return TreeEntry::NeedToGather; return TreeEntry::Vectorize; case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. @@ -7633,12 +7625,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::NeedToGather; } case Instruction::Call: { - if (S.MainOp->getType()->isFloatingPointTy() && - TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) { - auto *I = dyn_cast(V); - return I && !I->isFast(); - })) - return TreeEntry::NeedToGather; // Check if the calls are all to the same vectorizable intrinsic or // library function. CallInst *CI = cast(VL0); @@ -9358,13 +9344,8 @@ void BoUpSLP::transformNodes() { // insertvector instructions. unsigned StartIdx = 0; unsigned End = VL.size(); - for (unsigned VF = getFloorFullVectorNumberOfElements( - *TTI, VL.front()->getType(), VL.size() - 1); - VF >= MinVF; VF = getFloorFullVectorNumberOfElements( - *TTI, VL.front()->getType(), VF - 1)) { - if (StartIdx + VF > End) - continue; - SmallVector> Slices; + for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) { + SmallVector Slices; for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { ArrayRef Slice = VL.slice(Cnt, VF); // If any instruction is vectorized already - do not try again. 
@@ -9394,10 +9375,7 @@ void BoUpSLP::transformNodes() { if (IsSplat) continue; InstructionsState S = getSameOpcode(Slice, *TLI); - if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) || - (S.getOpcode() == Instruction::Load && - areKnownNonVectorizableLoads(Slice)) || - (S.getOpcode() != Instruction::Load && !has_single_bit(VF))) + if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice)) continue; if (VF == 2) { // Try to vectorize reduced values or if all users are vectorized. @@ -9417,16 +9395,8 @@ void BoUpSLP::transformNodes() { canVectorizeLoads(Slice, Slice.front(), Order, PointerOps); // Do not vectorize gathers. if (Res == LoadsState::ScatterVectorize || - Res == LoadsState::Gather) { - if (Res == LoadsState::Gather) { - registerNonVectorizableLoads(Slice); - // If reductions and the scalars from the root node are - // analyzed - mark as non-vectorizable reduction. - if (UserIgnoreList && E.Idx == 0) - analyzedReductionVals(Slice); - } + Res == LoadsState::Gather) continue; - } } else if (S.getOpcode() == Instruction::ExtractElement || (TTI->getInstructionCost( cast(Slice.front()), CostKind) < @@ -9441,17 +9411,17 @@ void BoUpSLP::transformNodes() { } } } - Slices.emplace_back(Cnt, Slice.size()); + Slices.emplace_back(Cnt); } - auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) { + auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) { E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt); if (StartIdx == Cnt) - StartIdx = Cnt + Sz; - if (End == Cnt + Sz) + StartIdx = Cnt + VF; + if (End == Cnt + VF) End = Cnt; }; - for (auto [Cnt, Sz] : Slices) { - ArrayRef Slice = VL.slice(Cnt, Sz); + for (unsigned Cnt : Slices) { + ArrayRef Slice = VL.slice(Cnt, VF); // If any instruction is vectorized already - do not try again. if (TreeEntry *SE = getTreeEntry(Slice.front()); SE || getTreeEntry(Slice.back())) { @@ -9460,7 +9430,7 @@ void BoUpSLP::transformNodes() { if (VF != SE->getVectorFactor() || !SE->isSame(Slice)) continue; SE->UserTreeIndices.emplace_back(&E, UINT_MAX); - AddCombinedNode(SE->Idx, Cnt, Sz); + AddCombinedNode(SE->Idx, Cnt); continue; } unsigned PrevSize = VectorizableTree.size(); @@ -9472,14 +9442,12 @@ void BoUpSLP::transformNodes() { VectorizableTree[PrevSize]->getOpcode() != Instruction::ExtractElement && !isSplat(Slice)) { - if (UserIgnoreList && E.Idx == 0 && VF == 2) - analyzedReductionVals(Slice); VectorizableTree.pop_back(); assert(PrevEntriesSize == LoadEntriesToVectorize.size() && "LoadEntriesToVectorize expected to remain the same"); continue; } - AddCombinedNode(PrevSize, Cnt, Sz); + AddCombinedNode(PrevSize, Cnt); } } } @@ -9574,24 +9542,11 @@ void BoUpSLP::transformNodes() { VectorizableTree.front()->Scalars.size() == SmallVF) || (VectorizableTree.size() <= 2 && UserIgnoreList)) return; - - if (VectorizableTree.front()->isNonPowOf2Vec() && - getCanonicalGraphSize() != getTreeSize() && UserIgnoreList && - getCanonicalGraphSize() <= SmallTree && - count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), - [](const std::unique_ptr &TE) { - return TE->isGather() && - TE->getOpcode() == Instruction::Load && - !allSameBlock(TE->Scalars); - }) == 1) - return; } // A list of loads to be gathered during the vectorization process. We can // try to vectorize them at the end, if profitable. 
- SmallMapVector, - SmallVector>>, 8> - GatheredLoads; + SmallVector>> GatheredLoads; for (std::unique_ptr &TE : VectorizableTree) { TreeEntry &E = *TE; @@ -9603,21 +9558,9 @@ void BoUpSLP::transformNodes() { !isVectorized(V) && !isDeleted(cast(V)); }))) && - !isSplat(E.Scalars)) { - for (Value *V : E.Scalars) { - auto *LI = dyn_cast(V); - if (!LI) - continue; - if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple()) - continue; - gatherPossiblyVectorizableLoads( - *this, V, *DL, *SE, *TTI, - GatheredLoads[std::make_tuple( - LI->getParent(), - getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth), - LI->getType())]); - } - } + !isSplat(E.Scalars)) + gatherPossiblyVectorizableLoads(*this, E.Scalars, *DL, *SE, *TTI, + GatheredLoads); } // Try to vectorize gathered loads if this is not just a gather of loads. if (!GatheredLoads.empty()) @@ -11572,34 +11515,6 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { return true; } -bool BoUpSLP::isTreeNotExtendable() const { - if (getCanonicalGraphSize() != getTreeSize()) { - constexpr unsigned SmallTree = 3; - if (VectorizableTree.front()->isNonPowOf2Vec() && - getCanonicalGraphSize() <= SmallTree && - count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()), - [](const std::unique_ptr &TE) { - return TE->isGather() && - TE->getOpcode() == Instruction::Load && - !allSameBlock(TE->Scalars); - }) == 1) - return true; - return false; - } - bool Res = false; - for (unsigned Idx : seq(getTreeSize())) { - TreeEntry &E = *VectorizableTree[Idx]; - if (!E.isGather()) - continue; - if (E.getOpcode() && E.getOpcode() != Instruction::Load) - return false; - if (isSplat(E.Scalars) || allConstant(E.Scalars)) - continue; - Res = true; - } - return Res; -} - InstructionCost BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, @@ -18856,8 +18771,7 @@ public: auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) { Key = hash_combine(hash_value(LI->getParent()), Key); - Value *Ptr = - getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth); + Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); if (!LoadKeyUsed.insert(Key).second) { auto LIt = LoadsMap.find(std::make_pair(Key, Ptr)); if (LIt != LoadsMap.end()) { @@ -19180,28 +19094,8 @@ public: RegMaxNumber * RedValsMaxNumber); unsigned ReduxWidth = NumReducedVals; - auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) { - unsigned NumParts, NumRegs; - Type *ScalarTy = Candidates.front()->getType(); - ReduxWidth = - getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth); - VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); - NumParts = TTI.getNumberOfParts(Tp); - NumRegs = - TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); - while (NumParts > NumRegs) { - ReduxWidth = bit_floor(ReduxWidth - 1); - VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth); - NumParts = TTI.getNumberOfParts(Tp); - NumRegs = - TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp)); - } - if (NumParts > NumRegs / 2) - ReduxWidth = bit_floor(ReduxWidth); - return ReduxWidth; - }; if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1)) - ReduxWidth = GetVectorFactor(ReduxWidth); + ReduxWidth = bit_floor(ReduxWidth); ReduxWidth = std::min(ReduxWidth, MaxElts); unsigned Start = 0; @@ -19209,7 +19103,10 @@ public: // Restarts vectorization attempt with lower vector factor. 
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
-      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
+      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
+                                  &CheckForReusedReductionOpsLocal,
+                                  &PrevReduxWidth, &V,
+                                  &IgnoreList](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
@@ -19220,13 +19117,10 @@ public:
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
-        --ReduxWidth;
-        if (ReduxWidth > 1)
-          ReduxWidth = GetVectorFactor(ReduxWidth);
+        ReduxWidth = bit_ceil(ReduxWidth) / 2;
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
-      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
@@ -19238,15 +19132,8 @@ public:
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
-        // Been analyzed already - skip.
-        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
-            (!has_single_bit(ReduxWidth) &&
-             (IgnoredCandidates.contains(
-                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
-              IgnoredCandidates.contains(
-                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
-                                 bit_floor(ReduxWidth))))) ||
-            V.areAnalyzedReductionVals(VL)) {
+        // Being analyzed already - skip.
+        if (V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
@@ -19352,24 +19239,8 @@ public:
                 << " and threshold "
                 << ore::NV("Threshold", -SLPCostThreshold);
        });
-        if (!AdjustReducedVals()) {
+        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
-          unsigned Offset = Pos == Start ? Pos : Pos - 1;
-          if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
-            // Add subvectors of VL to the list of the analyzed values.
- for (unsigned VF = getFloorFullVectorNumberOfElements( - *TTI, VL.front()->getType(), ReduxWidth - 1); - VF >= ReductionLimit; - VF = getFloorFullVectorNumberOfElements( - *TTI, VL.front()->getType(), VF - 1)) { - if (has_single_bit(VF) && - V.getCanonicalGraphSize() != V.getTreeSize()) - continue; - for (unsigned Idx : seq(ReduxWidth - VF)) - IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF)); - } - } - } continue; } @@ -19478,9 +19349,7 @@ public: } Pos += ReduxWidth; Start = Pos; - ReduxWidth = NumReducedVals - Pos; - if (ReduxWidth > 1) - ReduxWidth = GetVectorFactor(NumReducedVals - Pos); + ReduxWidth = llvm::bit_floor(NumReducedVals - Pos); AnyVectorized = true; } if (OptReusedScalars && !AnyVectorized) { diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 354791ddd6de..212177522409 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -43,32 +43,32 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x 
i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP34]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] @@ -86,19 +86,19 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> ; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]] ; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]] -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> ; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]] ; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]] -; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> +; CHECK-NEXT: 
[[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> ; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]] ; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> ; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], ; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], ; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index b16164c4e5ff..5f0b16048d40 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1215,26 +1215,26 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = 
shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> @@ -1242,14 +1242,14 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> @@ -1262,7 +1262,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> ; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index c431b058f0d2..fffa626cae0d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -17,17 +17,18 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( -; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] -; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll index 2191d04cd797..833bc56c4ec6 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll @@ -7,7 +7,8 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POWER-OF-2-NEXT: entry: ; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 ; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0) +; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> +; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> ; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POWER-OF-2-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index 234b65803238..757d0b1708b6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -11,21 +11,19 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> -; 
CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0) ; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4 ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: entry.if.end72_crit_edge: ; CHECK-NEXT: br label [[IF_END72:%.*]] @@ -48,7 +46,8 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float> -; CHECK-NEXT: store <4 x float> [[TMP26]], ptr [[RC21]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index c9ff2d6426d2..72e29839230e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -318,14 +318,22 @@ entry: define float @f(ptr nocapture readonly %x) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 
x float>, ptr [[ARRAYIDX_32]], align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: @@ -598,14 +606,18 @@ define float @loadadd31(ptr nocapture readonly %x) { ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] @@ -615,14 +627,18 @@ define float @loadadd31(ptr nocapture readonly %x) { ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 +; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float 
[[TMP5]], [[TMP6]] ; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index 0bc91d42b0f1..a7201e776fb4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1013,11 +1013,11 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: -; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0) -; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP4]], i64 4) -; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP2]], i64 6) +; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 +; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4) +; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0) +; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2) ; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) ; THRESH-NEXT: ret i32 [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll index 4898111960c0..47dd84c7f6e9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll @@ -7,11 +7,10 @@ define void @e(ptr %c, i64 %0) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C]], align 8 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 104 -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 112 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX5]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP18]], <2 x ptr> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2 @@ -19,7 +18,7 
@@ define void @e(ptr %c, i64 %0) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP8]], <2 x ptr> [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP9]], <2 x ptr> [[TMP6]], i64 4)
 ; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i64> [[TMP13]], <32 x i64> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i64> [[TMP14]], [[TMP12]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
index 2623b7689f4f..93258f2975f3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
@@ -8,6 +8,197 @@
 ; into bb1, vectorizing all the way to the broadcast load at the top.
 ; The stores in bb1 are external to this tree, but they are vectorizable and are
 ; in reverse order.
+define void @rotate_with_external_users(ptr %A, ptr %ptr) {
+; CHECK-LABEL: @rotate_with_external_users(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 2.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 2.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[A:%.*]], align 8
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], <double 4.400000e+00, double 3.300000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    ret void
+;
+bb1:
+  %ld = load double, ptr undef
+
+  %add1 = fadd double %ld, 1.1
+  %add2 = fadd double %ld, 2.2
+
+  %mul1 = fmul double %add1, 1.1
+  %mul2 = fmul double %add2, 2.2
+
+  ; These are external vectorizable stores with operands in reverse order.
+  %ptrA2 = getelementptr inbounds double, ptr %A, i64 1
+  store double %mul2, ptr %A
+  store double %mul1, ptr %ptrA2
+  br label %bb2
+
+bb2:
+  %add3 = fadd double %mul1, 3.3
+  %add4 = fadd double %mul2, 4.4
+  %seed = fcmp ogt double %add3, %add4
+  ret void
+}
+
+; This checks that non-consecutive external users are skipped.
+define void @non_consecutive_external_users(ptr %A, ptr %ptr) { +; CHECK-LABEL: @non_consecutive_external_users( +; CHECK-NEXT: bb1: +; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[LD]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[SHUFFLE]], +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], +; CHECK-NEXT: [[PTRA4:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 3 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 3 +; CHECK-NEXT: store double [[TMP4]], ptr [[A]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 2 +; CHECK-NEXT: store double [[TMP5]], ptr [[A]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 1 +; CHECK-NEXT: store double [[TMP6]], ptr [[PTRA4]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 0 +; CHECK-NEXT: store double [[TMP7]], ptr [[PTRA4]], align 8 +; CHECK-NEXT: br label [[SEED_LOOP:%.*]] +; CHECK: seed_loop: +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x double> [ [[TMP3]], [[BB1:%.*]] ], [ zeroinitializer, [[SEED_LOOP]] ] +; CHECK-NEXT: br label [[SEED_LOOP]] +; +bb1: + %ld = load double, ptr undef + + %add5 = fadd double %ld, 1.1 + %add6 = fadd double %ld, 2.2 + %add7 = fadd double %ld, 3.3 + %add8 = fadd double %ld, 4.4 + + %add1 = fadd double %add5, 1.1 + %add2 = fadd double %add6, 2.2 + %add3 = fadd double %add7, 3.3 + %add4 = fadd double %add8, 4.4 + + %mul1 = fmul double %add1, 1.1 + %mul2 = fmul double %add2, 2.2 + %mul3 = fmul double %add3, 3.3 + %mul4 = fmul double %add4, 4.4 + + ; External non-consecutive stores. + %ptrA4 = getelementptr inbounds double, ptr %A, i64 3 + store double %mul4, ptr %A + store double %mul3, ptr %A + store double %mul2, ptr %ptrA4 + store double %mul1, ptr %ptrA4 + br label %seed_loop + +seed_loop: + %phi1 = phi double [ %mul1, %bb1 ], [ 0.0, %seed_loop ] + %phi2 = phi double [ %mul2, %bb1 ], [ 0.0, %seed_loop ] + %phi3 = phi double [ %mul3, %bb1 ], [ 0.0, %seed_loop ] + %phi4 = phi double [ %mul4, %bb1 ], [ 0.0, %seed_loop ] + br label %seed_loop +} + +; We have to be careful when the tree contains add/sub patterns that could be +; combined into a single addsub instruction. Reordering can block the pattern. 
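+; (For example, X86's addsubpd computes { a0-b0, a1+b1 }; a vectorized bundle
+; whose lane 0 is an fsub and lane 1 an fadd maps straight onto it, so a
+; reordering that swaps those lanes can lose the match.)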
+define void @addsub_and_external_users(ptr %A, ptr %ptr) { +; CHECK-LABEL: @addsub_and_external_users( +; CHECK-NEXT: bb1: +; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: store <2 x double> [[SHUFFLE1]], ptr [[A:%.*]], align 8 +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]] +; CHECK-NEXT: ret void +; +bb1: + %ld = load double, ptr undef + + %sub1 = fsub double %ld, 1.1 + %add2 = fadd double %ld, 1.2 + + %div1 = fdiv double %sub1, 2.1 + %div2 = fdiv double %add2, 2.2 + + %mul1 = fmul double %div1, 3.1 + %mul2 = fmul double %div2, 3.2 + + ; These are external vectorizable stores with operands in reverse order. + %ptrA1 = getelementptr inbounds double, ptr %A, i64 1 + store double %mul2, ptr %A + store double %mul1, ptr %ptrA1 + br label %bb2 + +bb2: + %addS1 = fadd double %mul1, 4.1 + %addS2 = fadd double %mul2, 4.2 + %seed = fcmp ogt double %addS1, %addS2 + ret void +} + +; This contains a sub/add bundle, reordering it will make it better. +define void @subadd_and_external_users(ptr %A, ptr %ptr) { +; CHECK-LABEL: @subadd_and_external_users( +; CHECK-NEXT: bb1: +; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], +; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[A:%.*]], align 8 +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP9]], [[TMP8]] +; CHECK-NEXT: ret void +; +bb1: + %ld = load double, ptr undef + + %add1 = fadd double %ld, 1.1 + %sub2 = fsub double %ld, 1.2 + + %div1 = fdiv double %add1, 2.1 + %div2 = fdiv double %sub2, 2.2 + + %mul1 = fmul double %div1, 3.1 + %mul2 = fmul double %div2, 3.2 + + ; These are external vectorizable stores with operands in reverse order. 
+ %ptrA1 = getelementptr inbounds double, ptr %A, i64 1 + store double %mul2, ptr %A + store double %mul1, ptr %ptrA1 + br label %bb2 + +bb2: + %addS1 = fadd double %mul1, 4.1 + %addS2 = fadd double %mul2, 4.2 + %seed = fcmp ogt double %addS1, %addS2 + ret void +} + define void @alt_but_not_addsub_and_external_users(ptr %A, ptr %ptr) { ; CHECK-LABEL: @alt_but_not_addsub_and_external_users( ; CHECK-NEXT: bb1: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll index a821362a883a..fd3c1a57aff3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll @@ -7,7 +7,8 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 ; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0) +; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> ; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POW2-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll index 9719e60a6a69..e1b091cc6fcd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -8,18 +8,18 @@ define void @test(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0) ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> 
[[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP9]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[RESULT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] -- GitLab From 22e21bc1e796406c89e4a24fd81a1623ab2d7d85 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 21 Oct 2024 12:41:26 +0000 Subject: [PATCH 225/511] [llvm][llvm-lit] Correct description of --use-unique-output-file-name The initial version of this feature would use the output file name if it could, but in switching to temp files I forgot to replicate that behaviour. What happens now is we always use a tempfile name and the output path is a template for that. I think the current behaviour still makes sense so I'm just correcting the documentation. --- llvm/utils/lit/lit/cl_arguments.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py index c08c51b7b7a2..85744ded597c 100644 --- a/llvm/utils/lit/lit/cl_arguments.py +++ b/llvm/utils/lit/lit/cl_arguments.py @@ -177,11 +177,10 @@ def parse_args(): ) execution_group.add_argument( "--use-unique-output-file-name", - help="When enabled, lit will not overwrite existing test report files. " - "Instead it will write to a new file named the same as the output file " - "name but with an extra part before the file extension. For example " - "if results.xml already exists, results..xml will be written " - "to. The is not ordered in any way. [Default: Off]", + help="When enabled, lit will add a unique element to the output file name, " + 'before the extension. For example "results.xml" will become ' + '"results..xml". The "" is not ordered in any ' + "way and is chosen so that existing are not overwritten. [Default: Off]", action="store_true", ) execution_group.add_argument( -- GitLab From 6bac41496eb24c80aa659008d08220355a617c49 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Mon, 21 Oct 2024 08:49:13 -0400 Subject: [PATCH 226/511] [RISCV][GISEL] Legalize G_INSERT_SUBVECTOR (#108859) This code is heavily based on the SelectionDAG lowerINSERT_SUBVECTOR code. --- .../CodeGen/GlobalISel/GenericMachineInstrs.h | 12 + .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 100 +++ .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 142 +++- .../Target/RISCV/GISel/RISCVLegalizerInfo.h | 2 + llvm/lib/Target/RISCV/RISCVInstrGISel.td | 17 + .../rvv/legalize-insert-subvector.mir | 610 ++++++++++++++++++ 7 files changed, 881 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 92d37753791c..b6309a9ea0ec 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -811,6 +811,18 @@ public: } }; +/// Represents a insert subvector. +class GInsertSubvector : public GenericMachineInstr { +public: + Register getBigVec() const { return getOperand(1).getReg(); } + Register getSubVec() const { return getOperand(2).getReg(); } + uint64_t getIndexImm() const { return getOperand(3).getImm(); } + + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_INSERT_SUBVECTOR; + } +}; + /// Represents a freeze. 
class GFreeze : public GenericMachineInstr {
public:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index ec8a29938837..f682b20816d5 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -380,6 +380,8 @@ public:
 LLT CastTy);
 LegalizeResult bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
 LLT CastTy);
+ LegalizeResult bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
+ LLT CastTy);
 LegalizeResult lowerConstant(MachineInstr &MI);
 LegalizeResult lowerFConstant(MachineInstr &MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 3b2fd95076c4..98aece0d68d6 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3276,6 +3276,33 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
 Observer.changedInstr(MI);
 return Legalized;
 }
+ case TargetOpcode::G_INSERT_SUBVECTOR: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ GInsertSubvector &IS = cast<GInsertSubvector>(MI);
+ Register BigVec = IS.getBigVec();
+ Register SubVec = IS.getSubVec();
+
+ LLT SubVecTy = MRI.getType(SubVec);
+ LLT SubVecWideTy = SubVecTy.changeElementType(WideTy.getElementType());
+
+ // Widen the G_INSERT_SUBVECTOR
+ auto BigZExt = MIRBuilder.buildZExt(WideTy, BigVec);
+ auto SubZExt = MIRBuilder.buildZExt(SubVecWideTy, SubVec);
+ auto WideInsert = MIRBuilder.buildInsertSubvector(WideTy, BigZExt, SubZExt,
+ IS.getIndexImm());
+
+ // Truncate back down
+ auto SplatZero = MIRBuilder.buildSplatVector(
+ WideTy, MIRBuilder.buildConstant(WideTy.getElementType(), 0));
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, IS.getReg(0), WideInsert,
+ SplatZero);
+
+ MI.eraseFromParent();
+
+ return Legalized;
+ }
 }
}
@@ -3725,6 +3752,77 @@ LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
 return Legalized;
}
+/// This attempts to bitcast G_INSERT_SUBVECTOR to CastTy.
+///
+/// <DstTy> = G_INSERT_SUBVECTOR <BigVecTy>,
+///                              <SubVecTy>,
+///                              N
+///
+/// ===>
+///
+/// <CastTy> = G_BITCAST <BigVecTy>
+/// <CastSubVecTy> = G_BITCAST <SubVecTy>
+/// <CastTy> = G_INSERT_SUBVECTOR <CastTy>,
+///                               <CastSubVecTy>, N / 8
+/// <DstTy> = G_BITCAST <CastTy>
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
+ LLT CastTy) {
+ auto ES = cast<GInsertSubvector>(&MI);
+
+ if (!CastTy.isVector())
+ return UnableToLegalize;
+
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Register Dst = ES->getReg(0);
+ Register BigVec = ES->getBigVec();
+ Register SubVec = ES->getSubVec();
+ uint64_t Idx = ES->getIndexImm();
+
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT BigVecTy = MRI.getType(BigVec);
+ LLT SubVecTy = MRI.getType(SubVec);
+
+ if (DstTy == CastTy)
+ return Legalized;
+
+ if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
+ return UnableToLegalize;
+
+ ElementCount DstTyEC = DstTy.getElementCount();
+ ElementCount BigVecTyEC = BigVecTy.getElementCount();
+ ElementCount SubVecTyEC = SubVecTy.getElementCount();
+ auto DstTyMinElts = DstTyEC.getKnownMinValue();
+ auto BigVecTyMinElts = BigVecTyEC.getKnownMinValue();
+ auto SubVecTyMinElts = SubVecTyEC.getKnownMinValue();
+
+ unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
+ unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
+ if (CastEltSize < DstEltSize)
+ return UnableToLegalize;
+
+ auto AdjustAmt = CastEltSize / DstEltSize;
+ if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
+ BigVecTyMinElts % AdjustAmt != 0 || SubVecTyMinElts % AdjustAmt != 0)
+ return UnableToLegalize;
+
+ Idx /= AdjustAmt;
+ BigVecTy = LLT::vector(BigVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
+ SubVecTy = LLT::vector(SubVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
+ auto CastBigVec = MIRBuilder.buildBitcast(BigVecTy, BigVec);
+ auto CastSubVec = MIRBuilder.buildBitcast(SubVecTy, SubVec);
+ auto PromotedIS =
+ MIRBuilder.buildInsertSubvector(CastTy, CastBigVec, CastSubVec, Idx);
+ MIRBuilder.buildBitcast(Dst, PromotedIS);
+
+ ES->eraseFromParent();
+ return Legalized;
+}
+
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
 // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
 Register DstReg = LoadMI.getDstReg();
@@ -4033,6 +4131,8 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
 return bitcastConcatVector(MI, TypeIdx, CastTy);
 case TargetOpcode::G_EXTRACT_SUBVECTOR:
 return bitcastExtractSubvector(MI, TypeIdx, CastTy);
+ case TargetOpcode::G_INSERT_SUBVECTOR:
+ return bitcastInsertSubvector(MI, TypeIdx, CastTy);
 default:
 return UnableToLegalize;
 }
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index c06ab061ddc3..91f0a25328e7 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -615,6 +615,12 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
 all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
 typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST))));
+ getActionDefinitionsBuilder(G_INSERT_SUBVECTOR)
+ .customIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
+ typeIsLegalBoolVec(1, BoolVecTys, ST)))
+ .customIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
+
 getLegacyLegalizerInfo().computeTables();
}
@@ -834,9 +840,7 @@ static MachineInstrBuilder buildAllOnesMask(LLT VecTy, const SrcOp &VL,
/// Gets the two common "VL" operands: an all-ones mask and the vector length.
/// VecTy is a scalable vector type.
static std::pair<MachineInstrBuilder, MachineInstrBuilder>
-buildDefaultVLOps(const DstOp &Dst, MachineIRBuilder &MIB,
- MachineRegisterInfo &MRI) {
- LLT VecTy = Dst.getLLTTy(MRI);
+buildDefaultVLOps(LLT VecTy, MachineIRBuilder &MIB, MachineRegisterInfo &MRI) {
 assert(VecTy.isScalableVector() && "Expecting scalable container type");
 const RISCVSubtarget &STI = MIB.getMF().getSubtarget<RISCVSubtarget>();
 LLT XLenTy(STI.getXLenVT());
@@ -890,7 +894,7 @@ bool RISCVLegalizerInfo::legalizeSplatVector(MachineInstr &MI,
 // Handle case of s64 element vectors on rv32
 if (XLenTy.getSizeInBits() == 32 &&
 VecTy.getElementType().getSizeInBits() == 64) {
- auto [_, VL] = buildDefaultVLOps(Dst, MIB, MRI);
+ auto [_, VL] = buildDefaultVLOps(MRI.getType(Dst), MIB, MRI);
 buildSplatSplitS64WithVL(Dst, MIB.buildUndef(VecTy), SplatVal, VL, MIB,
 MRI);
 MI.eraseFromParent();
@@ -1025,6 +1029,134 @@ bool RISCVLegalizerInfo::legalizeExtractSubvector(MachineInstr &MI,
 return true;
}
+bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
+ LegalizerHelper &Helper,
+ MachineIRBuilder &MIB) const {
+ GInsertSubvector &IS = cast<GInsertSubvector>(MI);
+
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ Register Dst = IS.getReg(0);
+ Register BigVec = IS.getBigVec();
+ Register LitVec = IS.getSubVec();
+ uint64_t Idx = IS.getIndexImm();
+
+ LLT BigTy = MRI.getType(BigVec);
+ LLT LitTy = MRI.getType(LitVec);
+
+ if (Idx == 0 ||
+ MRI.getVRegDef(BigVec)->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+ return true;
+
+ // We don't have the ability to slide mask vectors up indexed by their i1
+ // elements; the smallest we can do is i8. Often we are able to bitcast to
+ // equivalent i8 vectors. Otherwise, we must zeroextend to equivalent i8
+ // vectors and truncate down after the insert.
+ if (LitTy.getElementType() == LLT::scalar(1)) {
+ auto BigTyMinElts = BigTy.getElementCount().getKnownMinValue();
+ auto LitTyMinElts = LitTy.getElementCount().getKnownMinValue();
+ if (BigTyMinElts >= 8 && LitTyMinElts >= 8)
+ return Helper.bitcast(
+ IS, 0,
+ LLT::vector(BigTy.getElementCount().divideCoefficientBy(8), 8));
+
+ // We can't slide this mask vector up indexed by its i1 elements.
+ // This poses a problem when we wish to insert a scalable vector which
+ // can't be re-expressed as a larger type. Just choose the slow path and
+ // extend to a larger type, then truncate back down.
+ LLT ExtBigTy = BigTy.changeElementType(LLT::scalar(8));
+ return Helper.widenScalar(IS, 0, ExtBigTy);
+ }
+
+ const RISCVRegisterInfo *TRI = STI.getRegisterInfo();
+ unsigned SubRegIdx, RemIdx;
+ std::tie(SubRegIdx, RemIdx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ getMVTForLLT(BigTy), getMVTForLLT(LitTy), Idx, TRI);
+
+ TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+ assert(isPowerOf2_64(
+ STI.expandVScale(LitTy.getSizeInBits()).getKnownMinValue()));
+ bool ExactlyVecRegSized =
+ STI.expandVScale(LitTy.getSizeInBits())
+ .isKnownMultipleOf(STI.expandVScale(VecRegSize));
+
+ // If the Idx has been completely eliminated and this subvector's size is a
+ // vector register or a multiple thereof, or the surrounding elements are
+ // undef, then this is a subvector insert which naturally aligns to a vector
+ // register. These can easily be handled using subregister manipulation.
+ if (RemIdx == 0 && ExactlyVecRegSized)
+ return true;
+
+ // If the subvector is smaller than a vector register, then the insertion
+ // must preserve the undisturbed elements of the register.
We do this by + // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type + // (which resolves to a subregister copy), performing a VSLIDEUP to place the + // subvector within the vector register, and an INSERT_SUBVECTOR of that + // LMUL=1 type back into the larger vector (resolving to another subregister + // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type + // to avoid allocating a large register group to hold our subvector. + + // VSLIDEUP works by leaving elements 0; +// Pseudo equivalent to a RISCVISD::VMV_V_V_VL +def G_VMV_V_V_VL : RISCVGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$vec, type2:$vl); + let hasSideEffects = false; +} +def : GINodeEquiv; + +// Pseudo equivalent to a RISCVISD::VSLIDEUP_VL +def G_VSLIDEUP_VL : RISCVGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$merge, type0:$vec, type1:$idx, type2:$mask, + type3:$vl, type4:$policy); + let hasSideEffects = false; +} +def : GINodeEquiv; + diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir new file mode 100644 index 000000000000..68c5ae120474 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir @@ -0,0 +1,610 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,RV32 +# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,RV64 + + +# BigVec=G_IMPLICIT_DEF when index is non-zero +--- +name: insert_subvector_nxv2i1_nxv4i1_undef_nonzero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_undef_nonzero + ; CHECK: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](), 2 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = G_IMPLICIT_DEF + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 2 + $v8 = COPY %2() + PseudoRET implicit $v8 +... + +# BigVec=G_IMPLICIT_DEF when index is zero +--- +name: insert_subvector_nxv2i1_nxv4i1_undef_zero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_undef_zero + ; CHECK: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](), 0 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = G_IMPLICIT_DEF + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 + $v8 = COPY %2() + PseudoRET implicit $v8 +... 
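+# With an index of zero the subvector lands at the start of the register
+# group, and with an undef destination there are no defined elements to
+# preserve, so both forms above stay as plain G_INSERT_SUBVECTOR (see the
+# Idx == 0 / G_IMPLICIT_DEF early-out in legalizeInsertSubvector).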
+ + +# Special handling for i1-element vectors with non-zero index +--- +name: insert_subvector_nxv2i1_nxv4i1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1 + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT]](s64) + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV32-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT1]](s64) + ; RV32-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[COPY]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT2]](s64) + ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV32-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT3]](s64) + ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](), 0 + ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C5]](s64) + ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C5]](s64) + ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]] + ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR1]](s64), [[VMSET_VL]](), [[ADD]](s64), 1 + ; RV32-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C6]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT4]](s64) + ; RV32-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VSLIDEUP_VL]](), [[SPLAT_VECTOR4]] + ; RV32-NEXT: $v8 = COPY [[ICMP]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1 + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C]](s32) + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C1]](s32) + ; RV64-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[COPY]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C2]](s32) + ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C3]](s32) + ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: 
[[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](), 0 + ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C5]](s32) + ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C5]](s32) + ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]] + ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR1]](s32), [[VMSET_VL]](), [[ADD]](s32), 1 + ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C6]](s32) + ; RV64-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VSLIDEUP_VL]](), [[SPLAT_VECTOR4]] + ; RV64-NEXT: $v8 = COPY [[ICMP]]() + ; RV64-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 2 + $v8 = COPY %2() + PseudoRET implicit $v8 +... +--- +name: insert_subvector_nxv4i1_nxv8i1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; RV32-LABEL: name: insert_subvector_nxv4i1_nxv8i1 + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT]](s64) + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV32-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT1]](s64) + ; RV32-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[COPY]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT2]](s64) + ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV32-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT3]](s64) + ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](), 0 + ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C5]](s64) + ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C6]](s64) + ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]] + ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR1]](s64), [[VMSET_VL]](), [[ADD]](s64), 0 + ; RV32-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C7]](s32) + ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[ANYEXT4]](s64) + 
; RV32-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VSLIDEUP_VL]](), [[SPLAT_VECTOR4]] + ; RV32-NEXT: $v8 = COPY [[ICMP]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv4i1_nxv8i1 + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C]](s32) + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C1]](s32) + ; RV64-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[COPY]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C2]](s32) + ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C3]](s32) + ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](), 0 + ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C5]](s32) + ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C6]](s32) + ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]] + ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR1]](s32), [[VMSET_VL]](), [[ADD]](s32), 0 + ; RV64-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C7]](s32) + ; RV64-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VSLIDEUP_VL]](), [[SPLAT_VECTOR4]] + ; RV64-NEXT: $v8 = COPY [[ICMP]]() + ; RV64-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 4 + $v8 = COPY %2() + PseudoRET implicit $v8 +... 
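+# The trailing immediate on G_VSLIDEUP_VL is the RVV tail/mask policy: the 0
+# here keeps the tail undisturbed, while the 1 in the previous test is tail
+# agnostic.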
+--- +name: insert_subvector_nxv32i1_nxv64i1 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; RV32-LABEL: name: insert_subvector_nxv32i1_nxv64i1 + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_() = G_BITCAST [[COPY]]() + ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_() = G_BITCAST [[DEF]]() + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[BITCAST1]](), 0 + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64) + ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64) + ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]] + ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[BITCAST]], [[INSERT_SUBVECTOR]], [[LSHR1]](s64), [[VMSET_VL]](), [[ADD]](s64), 1 + ; RV32-NEXT: [[BITCAST2:%[0-9]+]]:_() = G_BITCAST [[VSLIDEUP_VL]]() + ; RV32-NEXT: $v8 = COPY [[BITCAST2]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv32i1_nxv64i1 + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_() = G_BITCAST [[COPY]]() + ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_() = G_BITCAST [[DEF]]() + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[BITCAST1]](), 0 + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32) + ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32) + ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]] + ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[BITCAST]], [[INSERT_SUBVECTOR]], [[LSHR1]](s32), [[VMSET_VL]](), [[ADD]](s32), 1 + ; RV64-NEXT: [[BITCAST2:%[0-9]+]]:_() = G_BITCAST [[VSLIDEUP_VL]]() + ; RV64-NEXT: $v8 = COPY [[BITCAST2]]() + ; RV64-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 32 + $v8 = COPY %2() + PseudoRET implicit $v8 +... + +# i1-element vectors with zero index +--- +name: insert_subvector_nxv2i1_nxv4i1_zero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 + $v8 = COPY %2() + PseudoRET implicit $v8 +... 
+--- +name: insert_subvector_nxv4i1_nxv8i1_zero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; CHECK-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 + $v8 = COPY %2() + PseudoRET implicit $v8 +... +--- +name: insert_subvector_nxv32i1_nxv64i1_zero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; CHECK-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 + $v8 = COPY %2() + PseudoRET implicit $v8 +... + +# Insert with zero index +--- +name: insert_subvector_nxv1i8_nxv2i8_zero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; CHECK-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 + $v8 = COPY %2() + PseudoRET implicit $v8 +... +--- +name: insert_subvector_nxv2i16_nxv4i16_zero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; CHECK-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 + $v8 = COPY %2() + PseudoRET implicit $v8 +... +--- +name: insert_subvector_nxv4i32_nxv8i32_zero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8m4 + ; CHECK-LABEL: name: insert_subvector_nxv4i32_nxv8i32_zero + ; CHECK: liveins: $v8m4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8m4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8m4 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 + $v8 = COPY %2() + PseudoRET implicit $v8 +... 
+--- +name: insert_subvector_nxv2i64_nxv8i64_zero +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8m8 + ; CHECK-LABEL: name: insert_subvector_nxv2i64_nxv8i64_zero + ; CHECK: liveins: $v8m8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8m8 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8m8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 + $v8 = COPY %2() + PseudoRET implicit $v8 +... + +# Insert with non-zero index +--- +name: insert_subvector_nxv1i8_nxv2i8 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; RV32-LABEL: name: insert_subvector_nxv1i8_nxv2i8 + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64) + ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64) + ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]] + ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR1]](s64), [[VMSET_VL]](), [[ADD]](s64), 1 + ; RV32-NEXT: $v8 = COPY [[VSLIDEUP_VL]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv1i8_nxv2i8 + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32) + ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32) + ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]] + ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR1]](s32), [[VMSET_VL]](), [[ADD]](s32), 1 + ; RV64-NEXT: $v8 = COPY [[VSLIDEUP_VL]]() + ; RV64-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 1 + $v8 = COPY %2() + PseudoRET implicit $v8 +... 
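+# G_READ_VLENB returns VLEN/8 bytes, so shifting it right by 3 yields VLEN/64,
+# the runtime element count of one nxv1i8 fraction: one shift serves as the
+# slide offset, and the sum of the two is the VL for the slideup.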
+--- +name: insert_subvector_nxv2i16_nxv4i16 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8 + ; RV32-LABEL: name: insert_subvector_nxv2i16_nxv4i16 + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64) + ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64) + ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]] + ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR1]](s64), [[VMSET_VL]](), [[ADD]](s64), 0 + ; RV32-NEXT: $v8 = COPY [[VSLIDEUP_VL]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv2i16_nxv4i16 + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32) + ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32) + ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]] + ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR1]](s32), [[VMSET_VL]](), [[ADD]](s32), 0 + ; RV64-NEXT: $v8 = COPY [[VSLIDEUP_VL]]() + ; RV64-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 1 + $v8 = COPY %2() + PseudoRET implicit $v8 +... 
+--- +name: insert_subvector_nxv8i16_nxv1i16 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8m8 + ; RV32-LABEL: name: insert_subvector_nxv8i16_nxv1i16 + ; RV32: liveins: $v8m8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[COPY]](), 4 + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64) + ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[EXTRACT_SUBVECTOR]], [[INSERT_SUBVECTOR]](), [[LSHR]](s64) + ; RV32-NEXT: [[INSERT_SUBVECTOR1:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[VMV_V_V_VL]](), 4 + ; RV32-NEXT: $v8 = COPY [[INSERT_SUBVECTOR1]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv8i16_nxv1i16 + ; RV64: liveins: $v8m8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[COPY]](), 4 + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32) + ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[EXTRACT_SUBVECTOR]], [[INSERT_SUBVECTOR]](), [[LSHR]](s32) + ; RV64-NEXT: [[INSERT_SUBVECTOR1:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[VMV_V_V_VL]](), 4 + ; RV64-NEXT: $v8 = COPY [[INSERT_SUBVECTOR1]]() + ; RV64-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 4 + $v8 = COPY %2() + PseudoRET implicit $v8 +... +--- +name: insert_subvector_nxv4i32_nxv8i32 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8m4 + ; CHECK-LABEL: name: insert_subvector_nxv4i32_nxv8i32 + ; CHECK: liveins: $v8m4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8m4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 4 + ; CHECK-NEXT: $v8m4 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8m4 + %0:_() = COPY $v8m4 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 4 + $v8m4 = COPY %2() + PseudoRET implicit $v8m4 +... 
+--- +name: insert_subvector_nxv2i64_nxv8i64 +legalized: false +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $v8m8 + ; CHECK-LABEL: name: insert_subvector_nxv2i64_nxv8i64 + ; CHECK: liveins: $v8m8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8m8 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 4 + ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:_() = COPY $v8m8 + %1:_() = G_IMPLICIT_DEF + %2:_() = G_INSERT_SUBVECTOR %0(), %1, 4 + $v8 = COPY %2() + PseudoRET implicit $v8 +... -- GitLab From 08330dba923c6293b71c85a9f27153c630adc968 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 21 Oct 2024 12:54:44 +0000 Subject: [PATCH 227/511] [llvm][llvm-lit] Fix missing word in --use-unique-output-file-name help Fixes 22e21bc1e796406c89e4a24fd81a1623ab2d7d85. --- llvm/utils/lit/lit/cl_arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py index 85744ded597c..3e5488f388cc 100644 --- a/llvm/utils/lit/lit/cl_arguments.py +++ b/llvm/utils/lit/lit/cl_arguments.py @@ -180,7 +180,7 @@ def parse_args(): help="When enabled, lit will add a unique element to the output file name, " 'before the extension. For example "results.xml" will become ' '"results..xml". The "" is not ordered in any ' - "way and is chosen so that existing are not overwritten. [Default: Off]", + "way and is chosen so that existing files are not overwritten. [Default: Off]", action="store_true", ) execution_group.add_argument( -- GitLab From e26d9070d3eaee587b3ef0da6d12200a5b994765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Mon, 21 Oct 2024 15:00:32 +0200 Subject: [PATCH 228/511] [Reg2Mem] Add legacy pass wrapping Reg2Mem (#111024) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SPIR-V backend will need to use Reg2Mem, hence this pass needs to be wrapped to be used with the legacy pass manager. 
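For illustration only (this snippet is not part of the change), a backend can
then schedule the pass through the legacy pipeline via the creator function
added here, e.g. from its TargetPassConfig::addIRPasses() override:

    addPass(createRegToMemWrapperPass());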
---------

Signed-off-by: Nathan Gauër
---
 llvm/include/llvm/InitializePasses.h | 1 +
 llvm/include/llvm/LinkAllPasses.h | 1 +
 llvm/include/llvm/Transforms/Utils.h | 8 +++++
 llvm/lib/Transforms/Scalar/Reg2Mem.cpp | 43 ++++++++++++++++++++++++++
 4 files changed, 53 insertions(+)

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 1374880b6a71..e50cb0dd7541 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -256,6 +256,7 @@ void initializeRegAllocFastPass(PassRegistry &);
void initializeRegAllocPriorityAdvisorAnalysisPass(PassRegistry &);
void initializeRegAllocScoringPass(PassRegistry &);
void initializeRegBankSelectPass(PassRegistry &);
+void initializeRegToMemWrapperPassPass(PassRegistry &);
void initializeRegUsageInfoCollectorPass(PassRegistry &);
void initializeRegUsageInfoPropagationPass(PassRegistry &);
void initializeRegionInfoPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 92b59a66567c..3516b47d29ef 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -98,6 +98,7 @@ struct ForcePassLinking {
 (void)llvm::createNaryReassociatePass();
 (void)llvm::createObjCARCContractPass();
 (void)llvm::createPromoteMemoryToRegisterPass();
+ (void)llvm::createRegToMemWrapperPass();
 (void)llvm::createPostDomOnlyPrinterWrapperPassPass();
 (void)llvm::createPostDomPrinterWrapperPassPass();
 (void)llvm::createPostDomOnlyViewerWrapperPassPass();
diff --git a/llvm/include/llvm/Transforms/Utils.h b/llvm/include/llvm/Transforms/Utils.h
index 677cc3d128c3..ff306dbe3580 100644
--- a/llvm/include/llvm/Transforms/Utils.h
+++ b/llvm/include/llvm/Transforms/Utils.h
@@ -81,6 +81,14 @@ extern char &LCSSAID;
//
FunctionPass *createPromoteMemoryToRegisterPass();
+//===----------------------------------------------------------------------===//
+//
+// RegToMemWrapperPass - This pass is used to demote registers to memory
+// references. It basically undoes the PromoteMemoryToRegister pass to make cfg
+// hacking easier.
+//
+FunctionPass *createRegToMemWrapperPass();
+
//===----------------------------------------------------------------------===//
//
// LoopSimplify - Insert Pre-header blocks into the CFG for every function in
diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index ebc5075aa36f..30b27cb19b4a 100644
--- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -105,3 +106,45 @@ PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) {
 PA.preserve();
 return PA;
}
+
+namespace llvm {
+
+void initializeRegToMemWrapperPassPass(PassRegistry &);
+
+class RegToMemWrapperPass : public FunctionPass {
+public:
+ static char ID;
+
+ RegToMemWrapperPass() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ unsigned N = SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
+ bool Changed = runPass(F);
+ return N != 0 || Changed;
+ }
+};
+} // namespace llvm
+
+INITIALIZE_PASS_BEGIN(RegToMemWrapperPass, "reg2mem", "", true, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
+INITIALIZE_PASS_END(RegToMemWrapperPass, "reg2mem", "", true, true)
+
+char RegToMemWrapperPass::ID = 0;
+
+FunctionPass *llvm::createRegToMemWrapperPass() {
+ return new RegToMemWrapperPass();
+}
-- 
GitLab

From 89d8449a2900123c2e9bd7a11315381b2b70c155 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson
Date: Mon, 21 Oct 2024 15:12:53 +0200
Subject: [PATCH 229/511] [ORC] Fix LLJIT's __cxa_atexit declaration for
 clang-repl. (#113141)

Add sign extension on i32 return value.
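On targets whose ABI extends i32 return values (SystemZ, for example),
TargetLibraryInfo::getExtAttrForI32Return reports the needed attribute, so the
declaration the JIT emits there now reads roughly:

    declare signext i32 @__cxa_atexit(ptr, ptr, ptr)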
--- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 401ed525fd5c..db39fec12e5f 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -477,12 +477,16 @@ private: auto *CxaAtExitCallbackTy = FunctionType::get(VoidTy, {BytePtrTy}, false); auto *CxaAtExitCallbackPtrTy = PointerType::getUnqual(CxaAtExitCallbackTy); - addHelperAndWrapper( + auto *CxaAtExit = addHelperAndWrapper( *M, "__cxa_atexit", FunctionType::get(IntTy, {CxaAtExitCallbackPtrTy, BytePtrTy, BytePtrTy}, false), GlobalValue::DefaultVisibility, "__lljit.cxa_atexit_helper", {PlatformInstanceDecl}); + Attribute::AttrKind CxaAtExitExtAttr = + TargetLibraryInfo::getExtAttrForI32Return(J.getTargetTriple()); + if (CxaAtExitExtAttr != Attribute::None) + CxaAtExit->addRetAttr(CxaAtExitExtAttr); return ThreadSafeModule(std::move(M), std::move(Ctx)); } -- GitLab From ecfeacd152f07cf8aea210f63415e3e48b05ab22 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 21 Oct 2024 14:29:21 +0100 Subject: [PATCH 230/511] [AArch64] Convert aarch64_neon_sqxtn to ISD::TRUNCATE_SSAT_S and replace tablegen patterns This lowers the aarch64_neon_sqxtn intrinsics to the new TRUNCATE_SSAT_S ISD nodes, performing the same for sqxtun and uqxtn. This allows us to clean up the tablegen patterns a little and in a future commit add combines for sqxtn. --- .../Target/AArch64/AArch64ISelLowering.cpp | 9 ++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 85 +------------------ 2 files changed, 12 insertions(+), 82 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7448416c682a..03e8885b92f2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5941,6 +5941,15 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1)))); return SDValue(); } + case Intrinsic::aarch64_neon_sqxtn: + return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_neon_sqxtun: + return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_neon_uqxtn: + return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(), + Op.getOperand(1)); case Intrinsic::aarch64_sve_whilelo: return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false, /*IsEqual=*/false); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 6c9f0986b9e3..37dd43a203e5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5430,14 +5430,14 @@ defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", any_sint_to_fp>; defm SHLL : SIMDVectorLShiftLongBySizeBHS; defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; -defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; -defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; +defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", truncssat_s>; +defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", truncssat_u>; defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", BinOpFrag<(add node:$LHS, 
(AArch64uaddlp node:$RHS))> >; defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>; defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", any_uint_to_fp>; -defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; +defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", truncusat_u>; defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; @@ -5476,85 +5476,6 @@ defm : SIMDVectorLShiftLongBySizeBHSPats; defm : SIMDVectorLShiftLongBySizeBHSPats; defm : SIMDVectorLShiftLongBySizeBHSPats; -// Constant vector values, used in the S/UQXTN patterns below. -def VImmFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 85))))>; -def VImmFFFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 51))))>; -def VImm7F: PatLeaf<(AArch64movi_shift (i32 127), (i32 0))>; -def VImm80: PatLeaf<(AArch64mvni_shift (i32 127), (i32 0))>; -def VImm7FFF: PatLeaf<(AArch64movi_msl (i32 127), (i32 264))>; -def VImm8000: PatLeaf<(AArch64mvni_msl (i32 127), (i32 264))>; - -// trunc(umin(X, 255)) -> UQXTRN v8i8 -def : Pat<(v8i8 (truncusat_u (v8i16 V128:$Vn))), - (UQXTNv8i8 V128:$Vn)>; -// trunc(umin(X, 65535)) -> UQXTRN v4i16 -def : Pat<(v4i16 (truncusat_u (v4i32 V128:$Vn))), - (UQXTNv4i16 V128:$Vn)>; -// trunc(umin(X, 4294967295)) -> UQXTRN v2i32 -def : Pat<(v2i32 (truncusat_u (v2i64 V128:$Vn))), - (UQXTNv2i32 V128:$Vn)>; -// trunc(smin(smax(X, -128), 128)) -> SQXTRN -def : Pat<(v8i8 (truncssat_s (v8i16 V128:$Vn))), - (SQXTNv8i8 V128:$Vn)>; -// trunc(smin(smax(X, -32768), 32767)) -> SQXTRN -def : Pat<(v4i16 (truncssat_s (v4i32 V128:$Vn))), - (SQXTNv4i16 V128:$Vn)>; -// trunc(smin(smax(X, -2147483648), 2147483647)) -> SQXTRN -def : Pat<(v2i32 (truncssat_s (v2i64 V128:$Vn))), - (SQXTNv2i32 V128:$Vn)>; -// trunc(umin(smax(X, 0), 255)) -> SQXTUN -def : Pat<(v8i8 (truncssat_u (v8i16 V128:$Vn))), - (SQXTUNv8i8 V128:$Vn)>; -// trunc(umin(smax(X, 0), 65535)) -> SQXTUN -def : Pat<(v4i16 (truncssat_u (v4i32 V128:$Vn))), - (SQXTUNv4i16 V128:$Vn)>; -// trunc(umin(smax(X, 0), 4294967295)) -> SQXTUN -def : Pat<(v2i32 (truncssat_u (v2i64 V128:$Vn))), - (SQXTUNv2i32 V128:$Vn)>; - -// truncusat_u -// concat_vectors(Vd, truncusat_u(Vn)) ~> UQXTRN(Vd, Vn) -def : Pat<(v16i8 (concat_vectors - (v8i8 V64:$Vd), - (v8i8 (truncusat_u (v8i16 V128:$Vn))))), - (UQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -def : Pat<(v8i16 (concat_vectors - (v4i16 V64:$Vd), - (v4i16 (truncusat_u (v4i32 V128:$Vn))))), - (UQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -def : Pat<(v4i32 (concat_vectors - (v2i32 V64:$Vd), - (v2i32 (truncusat_u (v2i64 V128:$Vn))))), - (UQXTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; - -// concat_vectors(Vd, truncssat_s(Vn)) ~> SQXTN2(Vd, Vn) -def : Pat<(v16i8 (concat_vectors - (v8i8 V64:$Vd), - (v8i8 (truncssat_s (v8i16 V128:$Vn))))), - (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -def : Pat<(v8i16 (concat_vectors - (v4i16 V64:$Vd), - (v4i16 (truncssat_s (v4i32 V128:$Vn))))), - (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -def : Pat<(v4i32 (concat_vectors - (v2i32 V64:$Vd), - (v2i32 (truncssat_s (v2i64 V128:$Vn))))), - (SQXTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; - -// concat_vectors(Vd, truncssat_u(Vn)) ~> SQXTUN2(Vd, Vn) -def : Pat<(v16i8 (concat_vectors - (v8i8 
V64:$Vd), - (v8i8 (truncssat_u (v8i16 V128:$Vn))))), - (SQXTUNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -def : Pat<(v8i16 (concat_vectors - (v4i16 V64:$Vd), - (v4i16 (truncssat_u (v4i32 V128:$Vn))))), - (SQXTUNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -def : Pat<(v4i32 (concat_vectors - (v2i32 V64:$Vd), - (v2i32 (truncssat_u (v2i64 V128:$Vn))))), - (SQXTUNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; - // Select BSWAP vector instructions into REV instructions def : Pat<(v4i16 (bswap (v4i16 V64:$Rn))), (v4i16 (REV16v8i8 (v4i16 V64:$Rn)))>; -- GitLab From c44860c8d2582abd88794267b4fa0fa953bbef80 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Mon, 21 Oct 2024 14:32:21 +0100 Subject: [PATCH 231/511] [Flang][OpenMP] Disable lowering of omp.simd reductions in composites (#112686) Currently, the `omp.simd` operation is ignored during MLIR to LLVM IR translation when it takes part in a composite construct. One consequence of this limitation is that any entry block arguments defined by that operation will trigger a compiler crash if they are used anywhere, as they are not bound to an LLVM IR value. A previous PR introducing support for the `reduction` clause resulted in the creation and use of entry block arguments attached to the `omp.simd` operation, causing compiler crashes on 'do simd reduction(...)' constructs. This patch disables Flang lowering of simd reductions in 'do simd' constructs to avoid triggering these errors while translation to LLVM IR is still incomplete. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index cf469003b729..52a077cd5a79 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2209,6 +2209,12 @@ static void genCompositeDistributeParallelDoSimd( genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms); + // TODO: Remove this after omp.simd reductions on composite constructs are + // supported. + simdClauseOps.reductionVars.clear(); + simdClauseOps.reductionByref.clear(); + simdClauseOps.reductionSyms.clear(); + mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc, @@ -2230,9 +2236,7 @@ static void genCompositeDistributeParallelDoSimd( wsloopOp.setComposite(/*val=*/true); EntryBlockArgs simdArgs; - // TODO: Add private syms and vars. - simdArgs.reduction.syms = simdReductionSyms; - simdArgs.reduction.vars = simdClauseOps.reductionVars; + // TODO: Add private and reduction syms and vars. auto simdOp = genWrapperOp(converter, loc, simdClauseOps, simdArgs); simdOp.setComposite(/*val=*/true); @@ -2325,6 +2329,12 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms); + // TODO: Remove this after omp.simd reductions on composite constructs are + // supported. + simdClauseOps.reductionVars.clear(); + simdClauseOps.reductionByref.clear(); + simdClauseOps.reductionSyms.clear(); + // TODO: Support delayed privatization. 
          DataSharingProcessor dsp(converter, semaCtx, simdItem->clauses, eval,
                                   /*shouldCollectPreDeterminedSymbols=*/true,
@@ -2348,9 +2358,7 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
   wsloopOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private syms and vars.
-  simdArgs.reduction.syms = simdReductionSyms;
-  simdArgs.reduction.vars = simdClauseOps.reductionVars;
+  // TODO: Add private and reduction syms and vars.
   auto simdOp = genWrapperOp(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
-- 
GitLab


From 5c9c281c251402fd65bb01717112cf22019ee409 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Mon, 21 Oct 2024 06:50:03 -0700
Subject: [PATCH 232/511] [DebugInfo] Use heterogeneous lookups with std::map
 (NFC) (#113118)

---
 .../llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h   | 2 +-
 llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
index f76f2ecd3e21..9cda64e33ddf 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
@@ -47,7 +47,7 @@ struct LVSymbolTableEntry final {
 
 // Function names extracted from the object symbol table.
 class LVSymbolTable final {
-  using LVSymbolNames = std::map<std::string, LVSymbolTableEntry>;
+  using LVSymbolNames = std::map<std::string, LVSymbolTableEntry, std::less<>>;
   LVSymbolNames SymbolNames;
 
 public:
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
index c45f0e91c435..932346e1b011 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
@@ -87,20 +87,20 @@ LVSectionIndex LVSymbolTable::update(LVScope *Function) {
 
 const LVSymbolTableEntry &LVSymbolTable::getEntry(StringRef Name) {
   static LVSymbolTableEntry Empty = LVSymbolTableEntry();
-  LVSymbolNames::iterator Iter = SymbolNames.find(std::string(Name));
+  LVSymbolNames::iterator Iter = SymbolNames.find(Name);
   return Iter != SymbolNames.end() ? Iter->second : Empty;
 }
 
 LVAddress LVSymbolTable::getAddress(StringRef Name) {
-  LVSymbolNames::iterator Iter = SymbolNames.find(std::string(Name));
+  LVSymbolNames::iterator Iter = SymbolNames.find(Name);
   return Iter != SymbolNames.end() ? Iter->second.Address : 0;
 }
 
 LVSectionIndex LVSymbolTable::getIndex(StringRef Name) {
-  LVSymbolNames::iterator Iter = SymbolNames.find(std::string(Name));
+  LVSymbolNames::iterator Iter = SymbolNames.find(Name);
   return Iter != SymbolNames.end() ? Iter->second.SectionIndex
                                    : getReader().getDotTextSectionIndex();
 }
 
 bool LVSymbolTable::getIsComdat(StringRef Name) {
-  LVSymbolNames::iterator Iter = SymbolNames.find(std::string(Name));
+  LVSymbolNames::iterator Iter = SymbolNames.find(Name);
  return Iter != SymbolNames.end() ? Iter->second.IsComdat : false;
 }
-- 
GitLab


From 61a286ac0817671ad09a505303b7a3a446798316 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Mon, 21 Oct 2024 06:50:34 -0700
Subject: [PATCH 233/511] [tools] Don't call StringRef::str() when calling
 StringMap::find (NFC) (#113119)

StringMap::find takes StringRef. We don't need to create an instance of
std::string from StringRef only to convert it right back to StringRef.
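[Editorial illustration, not part of either patch: both NFC changes above rely on
C++ heterogeneous lookup, where a transparent comparator lets find() accept a
key-like type without materializing the map's key type. The sketch below is
self-contained C++17; std::string_view stands in for llvm::StringRef, and the map
and key names are invented for the example.]

    #include <cassert>
    #include <map>
    #include <string>
    #include <string_view>

    int main() {
      // std::less<> is a transparent comparator, so std::map::find accepts any
      // type that orders against std::string; the lookup below does not
      // construct a temporary std::string from the string_view.
      std::map<std::string, int, std::less<>> SymbolAddresses;
      SymbolAddresses.emplace("main", 0x1000);

      std::string_view Name = "main"; // stand-in for llvm::StringRef
      auto It = SymbolAddresses.find(Name); // heterogeneous lookup
      assert(It != SymbolAddresses.end() && It->second == 0x1000);
      return 0;
    }

[llvm::StringMap::find already takes a StringRef key, which is why the patch below
can simply drop the .str() round-trip rather than change the container.]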
--- llvm/tools/llvm-profdata/llvm-profdata.cpp | 4 ++-- llvm/tools/llvm-readtapi/llvm-readtapi.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index c235c3f2b105..59f0f1f1fae8 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -1297,7 +1297,7 @@ adjustInstrProfile(std::unique_ptr &WC, } else { auto NewName = StaticFuncMap.find(Name); if (NewName != StaticFuncMap.end()) { - It = InstrProfileMap.find(NewName->second.str()); + It = InstrProfileMap.find(NewName->second); if (NewName->second != DuplicateNameStr) { NewRootName = &NewName->second; } @@ -1382,7 +1382,7 @@ adjustInstrProfile(std::unique_ptr &WC, if (It == InstrProfileMap.end()) { auto NewName = StaticFuncMap.find(Name); if (NewName != StaticFuncMap.end()) { - It = InstrProfileMap.find(NewName->second.str()); + It = InstrProfileMap.find(NewName->second); if (NewName->second == DuplicateNameStr) { WithColor::warning() << "Static function " << Name diff --git a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp index 1d740109d5b6..7390d0ec4b79 100644 --- a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp +++ b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp @@ -382,7 +382,7 @@ static void stubifyDirectory(const StringRef InputPath, Context &Ctx) { // libraries to stubify. StringRef LibToCheck = Found->second; for (int i = 0; i < 20; ++i) { - auto LinkIt = SymLinks.find(LibToCheck.str()); + auto LinkIt = SymLinks.find(LibToCheck); if (LinkIt != SymLinks.end()) { for (auto &SymInfo : LinkIt->second) { SmallString LinkSrc(SymInfo.SrcPath); -- GitLab From 1bf1e92c72ec9086ab24103cf968e115b7248101 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 21 Oct 2024 06:51:25 -0700 Subject: [PATCH 234/511] [lldb] Avoid repeated map lookups (NFC) (#113121) --- lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index 584c2115459c..4fc48b4d1333 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -1295,12 +1295,11 @@ void SymbolFilePDB::CacheFunctionNames() { continue; if (CPlusPlusLanguage::IsCPPMangledName(name.c_str())) { - auto vm_addr = pub_sym_up->getVirtualAddress(); - // PDB public symbol has mangled name for its associated function. - if (vm_addr && addr_ids.find(vm_addr) != addr_ids.end()) { - // Cache mangled name. - m_func_full_names.Append(ConstString(name), addr_ids[vm_addr]); + if (auto vm_addr = pub_sym_up->getVirtualAddress()) { + if (auto it = addr_ids.find(vm_addr); it != addr_ids.end()) + // Cache mangled name. 
+ m_func_full_names.Append(ConstString(name), it->second); } } } -- GitLab From af6e1881e0791ac1ee611b62a3d12d9fb03ca142 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 21 Oct 2024 06:52:24 -0700 Subject: [PATCH 235/511] [mlir] Avoid repeated map lookups (NFC) (#113122) --- .../Dialect/SparseTensor/IR/SparseTensorDialect.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 32e1dcbc2cce..9854cfcc279b 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -1142,16 +1142,18 @@ bool mlir::sparse_tensor::isBlockSparsity(AffineMap dimToLvl) { auto pos = dimOp.getPosition(); if (binOp.getKind() == AffineExprKind::FloorDiv) { // Expect only one floordiv for each dimension. - if (coeffientMap.find(pos) != coeffientMap.end()) + auto [it, inserted] = coeffientMap.try_emplace(pos); + if (!inserted) return false; // Record coefficient of the floordiv. - coeffientMap[pos] = conOp.getValue(); + it->second = conOp.getValue(); } else if (binOp.getKind() == AffineExprKind::Mod) { // Expect floordiv before mod. - if (coeffientMap.find(pos) == coeffientMap.end()) + auto it = coeffientMap.find(pos); + if (it == coeffientMap.end()) return false; // Expect mod to have the same coefficient as floordiv. - if (conOp.getValue() != coeffientMap[pos]) + if (conOp.getValue() != it->second) return false; hasBlock = true; } else { -- GitLab From e2074c60bb3982cd8afb6408670332ea27da6383 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 21 Oct 2024 15:52:21 +0200 Subject: [PATCH 236/511] [AArch64] Use implicitTrunc in isBitfieldDstMask() (NFC) This code intentionally discards the high bits, so set implicitTrunc=true. This is currently NFC but will enable an APInt assertion in the future. --- .../lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 4 +++- llvm/test/CodeGen/AArch64/bitfield-insert.ll | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6133580a3cd7..2120443b6ba2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -2792,7 +2792,9 @@ static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted, "i32 or i64 mask type expected!"); unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; - APInt SignificantDstMask = APInt(BitWidth, DstMask); + // Enable implicitTrunc as we're intentionally ignoring high bits. 
+ APInt SignificantDstMask = + APInt(BitWidth, DstMask, /*isSigned=*/false, /*implicitTrunc=*/true); APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll index 14a594e8028d..eefb862c5313 100644 --- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll +++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll @@ -735,3 +735,21 @@ define i32 @orr_not_bfxil_test2_i32(i32 %0) { %4 = or i32 %2, %3 ret i32 %4 } + +define i16 @implicit_trunc_of_imm(ptr %p, i16 %a, i16 %b) { +; CHECK-LABEL: implicit_trunc_of_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w1, #0xffffe000 +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: mov w10, w8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: bfxil w10, w2, #0, #1 +; CHECK-NEXT: strh w10, [x9] +; CHECK-NEXT: ret +entry: + %and1 = and i16 %a, -8192 + %and2 = and i16 %b, 1 + %or = or i16 %and2, %and1 + store i16 %or, ptr %p + ret i16 %and1 +} -- GitLab From f0312d962d0510d613a5ad1aec0f0e44d4f124c0 Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Mon, 21 Oct 2024 15:55:40 +0200 Subject: [PATCH 237/511] [mlir][mlir-spirv-cpu-runner] Move MLIR pass pipeline to mlir-opt (#111575) Adds a new mlir-opt test-only pass, -test-spirv-cpu-runner-pipeline, which runs the set of MLIR passes needed for the mlir-spirv-cpu-runner, and removes them from the runner. The tests are changed to invoke mlir-opt with this flag before running the runner. The eventual goal is to move all host/device code generation steps out of the runner, like with some of the other runners. --- mlir/test/lib/Pass/CMakeLists.txt | 1 + .../lib/Pass/TestSPIRVCPURunnerPipeline.cpp | 47 +++++++++++++++++++ mlir/test/mlir-spirv-cpu-runner/double.mlir | 3 +- .../mlir-spirv-cpu-runner/simple_add.mlir | 3 +- mlir/tools/mlir-opt/mlir-opt.cpp | 2 + .../mlir-spirv-cpu-runner.cpp | 24 ---------- 6 files changed, 54 insertions(+), 26 deletions(-) create mode 100644 mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp diff --git a/mlir/test/lib/Pass/CMakeLists.txt b/mlir/test/lib/Pass/CMakeLists.txt index dd90c228cdaf..9f79944ff896 100644 --- a/mlir/test/lib/Pass/CMakeLists.txt +++ b/mlir/test/lib/Pass/CMakeLists.txt @@ -2,6 +2,7 @@ add_mlir_library(MLIRTestPass TestDynamicPipeline.cpp TestPassManager.cpp + TestSPIRVCPURunnerPipeline.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp b/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp new file mode 100644 index 000000000000..ded0d22c3130 --- /dev/null +++ b/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp @@ -0,0 +1,47 @@ +//===------------------ TestSPIRVCPURunnerPipeline.cpp --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements a pipeline for use by mlir-spirv-cpu-runner tests. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h" +#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVMPass.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" +#include "mlir/Dialect/SPIRV/Transforms/Passes.h" +#include "mlir/Pass/PassManager.h" + +using namespace mlir; + +namespace { + +void buildTestSPIRVCPURunnerPipeline(OpPassManager &passManager) { + passManager.addPass(createGpuKernelOutliningPass()); + passManager.addPass(createConvertGPUToSPIRVPass(/*mapMemorySpace=*/true)); + + OpPassManager &nestedPM = passManager.nest(); + nestedPM.addPass(spirv::createSPIRVLowerABIAttributesPass()); + nestedPM.addPass(spirv::createSPIRVUpdateVCEPass()); + passManager.addPass(createLowerHostCodeToLLVMPass()); + passManager.addPass(createConvertSPIRVToLLVMPass()); +} + +} // namespace + +namespace mlir { +namespace test { +void registerTestSPIRVCPURunnerPipeline() { + PassPipelineRegistration<>( + "test-spirv-cpu-runner-pipeline", + "Runs a series of passes for lowering SPIR-V-dialect MLIR to " + "LLVM-dialect MLIR intended for mlir-spirv-cpu-runner.", + buildTestSPIRVCPURunnerPipeline); +} +} // namespace test +} // namespace mlir diff --git a/mlir/test/mlir-spirv-cpu-runner/double.mlir b/mlir/test/mlir-spirv-cpu-runner/double.mlir index cd551ffb1bd0..35557ba1e94c 100644 --- a/mlir/test/mlir-spirv-cpu-runner/double.mlir +++ b/mlir/test/mlir-spirv-cpu-runner/double.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-spirv-cpu-runner %s -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \ +// RUN: mlir-opt %s -test-spirv-cpu-runner-pipeline \ +// RUN: | mlir-spirv-cpu-runner - -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \ // RUN: | FileCheck %s // CHECK: [8, 8, 8, 8, 8, 8] diff --git a/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir b/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir index 119e973e45e4..75675a69a675 100644 --- a/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir +++ b/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-spirv-cpu-runner %s -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \ +// RUN: mlir-opt %s -test-spirv-cpu-runner-pipeline \ +// RUN: | mlir-spirv-cpu-runner - -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \ // RUN: | FileCheck %s // CHECK: data = diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 36b142484bb0..002c3900056d 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -142,6 +142,7 @@ void registerTestSCFWhileOpBuilderPass(); void registerTestSCFWrapInZeroTripCheckPasses(); void registerTestShapeMappingPass(); void registerTestSliceAnalysisPass(); +void registerTestSPIRVCPURunnerPipeline(); void registerTestSPIRVFuncSignatureConversion(); void registerTestSPIRVVectorUnrolling(); void registerTestTensorCopyInsertionPass(); @@ -278,6 +279,7 @@ void registerTestPasses() { mlir::test::registerTestSCFWrapInZeroTripCheckPasses(); mlir::test::registerTestShapeMappingPass(); mlir::test::registerTestSliceAnalysisPass(); + mlir::test::registerTestSPIRVCPURunnerPipeline(); mlir::test::registerTestSPIRVFuncSignatureConversion(); mlir::test::registerTestSPIRVVectorUnrolling(); mlir::test::registerTestTensorCopyInsertionPass(); diff 
--git a/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp b/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp index 7e0b51cac806..22ad1024db4a 100644 --- a/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp +++ b/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp @@ -12,18 +12,12 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" -#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h" -#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVMPass.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" -#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" -#include "mlir/Dialect/SPIRV/Transforms/Passes.h" #include "mlir/ExecutionEngine/JitRunner.h" #include "mlir/ExecutionEngine/OptUtils.h" #include "mlir/Pass/Pass.h" @@ -75,23 +69,6 @@ convertMLIRModule(Operation *op, llvm::LLVMContext &context) { return mainModule; } -static LogicalResult runMLIRPasses(Operation *module, - JitRunnerOptions &options) { - PassManager passManager(module->getContext(), - module->getName().getStringRef()); - if (failed(applyPassManagerCLOptions(passManager))) - return failure(); - passManager.addPass(createGpuKernelOutliningPass()); - passManager.addPass(createConvertGPUToSPIRVPass(/*mapMemorySpace=*/true)); - - OpPassManager &nestedPM = passManager.nest(); - nestedPM.addPass(spirv::createSPIRVLowerABIAttributesPass()); - nestedPM.addPass(spirv::createSPIRVUpdateVCEPass()); - passManager.addPass(createLowerHostCodeToLLVMPass()); - passManager.addPass(createConvertSPIRVToLLVMPass()); - return passManager.run(module); -} - int main(int argc, char **argv) { llvm::InitLLVM y(argc, argv); @@ -99,7 +76,6 @@ int main(int argc, char **argv) { llvm::InitializeNativeTargetAsmPrinter(); mlir::JitRunnerConfig jitRunnerConfig; - jitRunnerConfig.mlirTransformer = runMLIRPasses; jitRunnerConfig.llvmModuleBuilder = convertMLIRModule; mlir::DialectRegistry registry; -- GitLab From f2302ed3d0f84ca867a3e664ed65bc89e52ee670 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Mon, 21 Oct 2024 06:41:35 -0700 Subject: [PATCH 238/511] [RISCV][GISEL] Fix operand on RISCV::G_VMV_V_V_VL 6bac41496eb24c80aa659008d08220355a617c49 added this opcode with the wrong number of operands. It didn't fail on check-llvm for me or on pre-commit CI, but once committed we got buildbot failures. This patch fixes the definition of the instruction and fixes the failing test. 
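[A hedged sketch, not from the patch: after this fix, anything that creates
G_VMV_V_V_VL must supply three source operands (passthru, vec, vl), mirroring
RISCVISD::VMV_V_V_VL. The helper and register names below are invented and the
opcode-enum include is hypothetical; MachineIRBuilder::buildInstr is the real API.]

    #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
    // Hypothetical include for the target's generated opcode enum:
    #include "RISCVInstrInfo.h"

    using namespace llvm;

    // Invented helper: build a G_VMV_V_V_VL with the corrected operand count.
    static MachineInstrBuilder buildVMVVVVL(MachineIRBuilder &MIB, Register Dst,
                                            Register Passthru, Register Vec,
                                            Register VL) {
      // Three sources: passthru, vec and vl; the old definition dropped passthru.
      return MIB.buildInstr(RISCV::G_VMV_V_V_VL, {Dst}, {Passthru, Vec, VL});
    }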
---
 llvm/lib/Target/RISCV/RISCVInstrGISel.td                   | 2 +-
 .../GlobalISel/legalizer/rvv/legalize-insert-subvector.mir | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrGISel.td b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
index 80f1901513b6..763aead84dd8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
@@ -70,7 +70,7 @@ def : GINodeEquiv;
 // Pseudo equivalent to a RISCVISD::VMV_V_V_VL
 def G_VMV_V_V_VL : RISCVGenericInstruction {
   let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$vec, type2:$vl);
+  let InOperandList = (ins type0:$passthru, type0:$vec, type1:$vl);
   let hasSideEffects = false;
 }
 def : GINodeEquiv;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
index 68c5ae120474..81a3a0c7ddd0 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
@@ -538,7 +538,7 @@ body: |
    ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
    ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
    ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64)
-   ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[EXTRACT_SUBVECTOR]], [[INSERT_SUBVECTOR]](), [[LSHR]](s64)
+   ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[EXTRACT_SUBVECTOR]], [[INSERT_SUBVECTOR]], [[LSHR]](s64)
    ; RV32-NEXT: [[INSERT_SUBVECTOR1:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[VMV_V_V_VL]](), 4
    ; RV32-NEXT: $v8 = COPY [[INSERT_SUBVECTOR1]]()
    ; RV32-NEXT: PseudoRET implicit $v8
@@ -556,7 +556,7 @@ body: |
    ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
    ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
    ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32)
-   ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[EXTRACT_SUBVECTOR]], [[INSERT_SUBVECTOR]](), [[LSHR]](s32)
+   ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[EXTRACT_SUBVECTOR]], [[INSERT_SUBVECTOR]], [[LSHR]](s32)
    ; RV64-NEXT: [[INSERT_SUBVECTOR1:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[VMV_V_V_VL]](), 4
    ; RV64-NEXT: $v8 = COPY [[INSERT_SUBVECTOR1]]()
    ; RV64-NEXT: PseudoRET implicit $v8
-- 
GitLab


From bd861d0e690cfd05184d86e954289cccfec97e92 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 21 Oct 2024 15:04:20 +0100
Subject: [PATCH 239/511] [AArch64] Add some basic patterns for qshrn.

With the truncssat nodes these are relatively simple tablegen patterns to
add. The existing intrinsics are converted to shift+truncsat so they can
lower using the new patterns.

Fixes #112925.
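[An illustration of the effect, mine rather than the patch author's: the updated
qshrn.ll tests below exercise a shift followed by a saturating narrow written as
two separate operations. The same idiom at the C level, using standard arm_neon.h
ACLE intrinsics; the function name is invented.]

    #include <arm_neon.h>

    // vshrq_n_s32 lowers to an ashr and vqmovn_s32 to @llvm.aarch64.neon.sqxtn;
    // with the new truncssat patterns the pair now selects to a single
    //   sqshrn v0.4h, v0.4s, #5
    // instead of an sshr followed by a sqxtn.
    int16x4_t narrow_shift(int32x4_t X) {
      return vqmovn_s32(vshrq_n_s32(X, 5));
    }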
--- .../Target/AArch64/AArch64ISelLowering.cpp | 21 +++++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 +- llvm/test/CodeGen/AArch64/qshrn.ll | 90 +++++++------------ 3 files changed, 54 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 03e8885b92f2..bf2f0674b5b6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5950,6 +5950,27 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_neon_uqxtn: return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::aarch64_neon_sqshrn: + if (Op.getValueType().isVector()) + return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(), + DAG.getNode(AArch64ISD::VASHR, dl, + Op.getOperand(1).getValueType(), + Op.getOperand(1), Op.getOperand(2))); + return SDValue(); + case Intrinsic::aarch64_neon_sqshrun: + if (Op.getValueType().isVector()) + return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(), + DAG.getNode(AArch64ISD::VASHR, dl, + Op.getOperand(1).getValueType(), + Op.getOperand(1), Op.getOperand(2))); + return SDValue(); + case Intrinsic::aarch64_neon_uqshrn: + if (Op.getValueType().isVector()) + return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(), + DAG.getNode(AArch64ISD::VLSHR, dl, + Op.getOperand(1).getValueType(), + Op.getOperand(1), Op.getOperand(2))); + return SDValue(); case Intrinsic::aarch64_sve_whilelo: return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false, /*IsEqual=*/false); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 37dd43a203e5..76a1029415b1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -8007,9 +8007,9 @@ defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", - int_aarch64_neon_sqshrn>; + BinOpFrag<(truncssat_s (AArch64vashr node:$LHS, node:$RHS))>>; defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", - int_aarch64_neon_sqshrun>; + BinOpFrag<(truncssat_u (AArch64vashr node:$LHS, node:$RHS))>>; defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>; def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), @@ -8030,7 +8030,7 @@ defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", - int_aarch64_neon_uqshrn>; + BinOpFrag<(truncusat_u (AArch64vlshr node:$LHS, node:$RHS))>>; defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", TriOpFrag<(add node:$LHS, diff --git a/llvm/test/CodeGen/AArch64/qshrn.ll b/llvm/test/CodeGen/AArch64/qshrn.ll index eaba88da7b09..0212ff53b250 100644 --- a/llvm/test/CodeGen/AArch64/qshrn.ll +++ b/llvm/test/CodeGen/AArch64/qshrn.ll @@ -4,8 +4,7 @@ define <4 x i16> @NarrowAShrI32By5(<4 x i32> %x) { ; CHECK-LABEL: NarrowAShrI32By5: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.4s, v0.4s, #5 -; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: sqshrn v0.4h, v0.4s, #5 ; CHECK-NEXT: ret %s = ashr <4 x i32> %x, %r 
= tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) @@ -26,8 +25,7 @@ define <4 x i16> @NarrowAShrU32By5(<4 x i32> %x) { define <4 x i16> @NarrowAShrI32By5ToU16(<4 x i32> %x) { ; CHECK-LABEL: NarrowAShrI32By5ToU16: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.4s, v0.4s, #5 -; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: sqshrun v0.4h, v0.4s, #5 ; CHECK-NEXT: ret %s = ashr <4 x i32> %x, %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) @@ -48,8 +46,7 @@ define <4 x i16> @NarrowLShrI32By5(<4 x i32> %x) { define <4 x i16> @NarrowLShrU32By5(<4 x i32> %x) { ; CHECK-LABEL: NarrowLShrU32By5: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v0.4s, v0.4s, #5 -; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: uqshrn v0.4h, v0.4s, #5 ; CHECK-NEXT: ret %s = lshr <4 x i32> %x, %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) @@ -71,8 +68,7 @@ define <4 x i16> @NarrowLShrI32By5ToU16(<4 x i32> %x) { define <2 x i32> @NarrowAShri64By5(<2 x i64> %x) { ; CHECK-LABEL: NarrowAShri64By5: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.2d, v0.2d, #5 -; CHECK-NEXT: sqxtn v0.2s, v0.2d +; CHECK-NEXT: sqshrn v0.2s, v0.2d, #5 ; CHECK-NEXT: ret %s = ashr <2 x i64> %x, %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %s) @@ -93,8 +89,7 @@ define <2 x i32> @NarrowAShrU64By5(<2 x i64> %x) { define <2 x i32> @NarrowAShri64By5ToU32(<2 x i64> %x) { ; CHECK-LABEL: NarrowAShri64By5ToU32: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.2d, v0.2d, #5 -; CHECK-NEXT: sqxtun v0.2s, v0.2d +; CHECK-NEXT: sqshrun v0.2s, v0.2d, #5 ; CHECK-NEXT: ret %s = ashr <2 x i64> %x, %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %s) @@ -115,8 +110,7 @@ define <2 x i32> @NarrowLShri64By5(<2 x i64> %x) { define <2 x i32> @NarrowLShrU64By5(<2 x i64> %x) { ; CHECK-LABEL: NarrowLShrU64By5: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v0.2d, v0.2d, #5 -; CHECK-NEXT: uqxtn v0.2s, v0.2d +; CHECK-NEXT: uqshrn v0.2s, v0.2d, #5 ; CHECK-NEXT: ret %s = lshr <2 x i64> %x, %r = tail call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %s) @@ -138,8 +132,7 @@ define <2 x i32> @NarrowLShri64By5ToU32(<2 x i64> %x) { define <8 x i8> @NarrowAShri16By5(<8 x i16> %x) { ; CHECK-LABEL: NarrowAShri16By5: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.8h, v0.8h, #5 -; CHECK-NEXT: sqxtn v0.8b, v0.8h +; CHECK-NEXT: sqshrn v0.8b, v0.8h, #5 ; CHECK-NEXT: ret %s = ashr <8 x i16> %x, %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %s) @@ -160,8 +153,7 @@ define <8 x i8> @NarrowAShrU16By5(<8 x i16> %x) { define <8 x i8> @NarrowAShri16By5ToU8(<8 x i16> %x) { ; CHECK-LABEL: NarrowAShri16By5ToU8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.8h, v0.8h, #5 -; CHECK-NEXT: sqxtun v0.8b, v0.8h +; CHECK-NEXT: sqshrun v0.8b, v0.8h, #5 ; CHECK-NEXT: ret %s = ashr <8 x i16> %x, %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %s) @@ -182,8 +174,7 @@ define <8 x i8> @NarrowLShri16By5(<8 x i16> %x) { define <8 x i8> @NarrowLShrU16By5(<8 x i16> %x) { ; CHECK-LABEL: NarrowLShrU16By5: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v0.8h, v0.8h, #5 -; CHECK-NEXT: uqxtn v0.8b, v0.8h +; CHECK-NEXT: uqshrn v0.8b, v0.8h, #5 ; CHECK-NEXT: ret %s = lshr <8 x i16> %x, %r = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %s) @@ -208,8 +199,7 @@ define <8 x i8> @NarrowLShri16By5ToU8(<8 x i16> %x) { define <4 x i16> @NarrowAShrI32By31(<4 x i32> %x) { ; CHECK-LABEL: NarrowAShrI32By31: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.4s, v0.4s, #16 -; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: sqshrn v0.4h, v0.4s, #16 
; CHECK-NEXT: ret %s = ashr <4 x i32> %x, %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) @@ -219,8 +209,7 @@ define <4 x i16> @NarrowAShrI32By31(<4 x i32> %x) { define <4 x i16> @NarrowAShrI32By31ToU16(<4 x i32> %x) { ; CHECK-LABEL: NarrowAShrI32By31ToU16: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.4s, v0.4s, #16 -; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: sqshrun v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %s = ashr <4 x i32> %x, %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) @@ -230,8 +219,7 @@ define <4 x i16> @NarrowAShrI32By31ToU16(<4 x i32> %x) { define <4 x i16> @NarrowLShrU32By31(<4 x i32> %x) { ; CHECK-LABEL: NarrowLShrU32By31: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v0.4s, v0.4s, #16 -; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: uqshrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %s = lshr <4 x i32> %x, %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) @@ -242,10 +230,8 @@ define <4 x i16> @NarrowLShrU32By31(<4 x i32> %x) { define <16 x i8> @signed_minmax_v8i16_to_v16i8(<16 x i16> %x) { ; CHECK-LABEL: signed_minmax_v8i16_to_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.8h, v0.8h, #5 -; CHECK-NEXT: sshr v1.8h, v1.8h, #5 -; CHECK-NEXT: sqxtn v0.8b, v0.8h -; CHECK-NEXT: sqxtn2 v0.16b, v1.8h +; CHECK-NEXT: sqshrn v0.8b, v0.8h, #5 +; CHECK-NEXT: sqshrn2 v0.16b, v1.8h, #5 ; CHECK-NEXT: ret entry: %s = ashr <16 x i16> %x, @@ -258,10 +244,8 @@ entry: define <16 x i8> @unsigned_minmax_v8i16_to_v16i8(<16 x i16> %x) { ; CHECK-LABEL: unsigned_minmax_v8i16_to_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushr v0.8h, v0.8h, #5 -; CHECK-NEXT: ushr v1.8h, v1.8h, #5 -; CHECK-NEXT: uqxtn v0.8b, v0.8h -; CHECK-NEXT: uqxtn2 v0.16b, v1.8h +; CHECK-NEXT: uqshrn v0.8b, v0.8h, #5 +; CHECK-NEXT: uqshrn2 v0.16b, v1.8h, #5 ; CHECK-NEXT: ret entry: %s = lshr <16 x i16> %x, @@ -273,10 +257,8 @@ entry: define <16 x i8> @unsigned_signed_minmax_v8i16_to_v16i8(<16 x i16> %x) { ; CHECK-LABEL: unsigned_signed_minmax_v8i16_to_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.8h, v0.8h, #5 -; CHECK-NEXT: sshr v1.8h, v1.8h, #5 -; CHECK-NEXT: sqxtun v0.8b, v0.8h -; CHECK-NEXT: sqxtun2 v0.16b, v1.8h +; CHECK-NEXT: sqshrun v0.8b, v0.8h, #5 +; CHECK-NEXT: sqshrun2 v0.16b, v1.8h, #5 ; CHECK-NEXT: ret entry: %s = ashr <16 x i16> %x, @@ -290,10 +272,8 @@ entry: define <8 x i16> @signed_minmax_v4i32_to_v8i16(<8 x i32> %x) { ; CHECK-LABEL: signed_minmax_v4i32_to_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.4s, v0.4s, #5 -; CHECK-NEXT: sshr v1.4s, v1.4s, #5 -; CHECK-NEXT: sqxtn v0.4h, v0.4s -; CHECK-NEXT: sqxtn2 v0.8h, v1.4s +; CHECK-NEXT: sqshrn v0.4h, v0.4s, #5 +; CHECK-NEXT: sqshrn2 v0.8h, v1.4s, #5 ; CHECK-NEXT: ret entry: %s = ashr <8 x i32> %x, @@ -306,10 +286,8 @@ entry: define <8 x i16> @unsigned_minmax_v4i32_to_v8i16(<8 x i32> %x) { ; CHECK-LABEL: unsigned_minmax_v4i32_to_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushr v0.4s, v0.4s, #5 -; CHECK-NEXT: ushr v1.4s, v1.4s, #5 -; CHECK-NEXT: uqxtn v0.4h, v0.4s -; CHECK-NEXT: uqxtn2 v0.8h, v1.4s +; CHECK-NEXT: uqshrn v0.4h, v0.4s, #5 +; CHECK-NEXT: uqshrn2 v0.8h, v1.4s, #5 ; CHECK-NEXT: ret entry: %s = lshr <8 x i32> %x, @@ -321,10 +299,8 @@ entry: define <8 x i16> @unsigned_signed_minmax_v4i32_to_v8i16(<8 x i32> %x) { ; CHECK-LABEL: unsigned_signed_minmax_v4i32_to_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.4s, v0.4s, #5 -; CHECK-NEXT: sshr v1.4s, v1.4s, #5 -; CHECK-NEXT: sqxtun v0.4h, v0.4s -; CHECK-NEXT: sqxtun2 v0.8h, v1.4s +; CHECK-NEXT: sqshrun 
v0.4h, v0.4s, #5 +; CHECK-NEXT: sqshrun2 v0.8h, v1.4s, #5 ; CHECK-NEXT: ret entry: %s = ashr <8 x i32> %x, @@ -338,10 +314,8 @@ entry: define <4 x i32> @signed_minmax_v4i64_to_v8i32(<4 x i64> %x) { ; CHECK-LABEL: signed_minmax_v4i64_to_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.2d, v0.2d, #5 -; CHECK-NEXT: sshr v1.2d, v1.2d, #5 -; CHECK-NEXT: sqxtn v0.2s, v0.2d -; CHECK-NEXT: sqxtn2 v0.4s, v1.2d +; CHECK-NEXT: sqshrn v0.2s, v0.2d, #5 +; CHECK-NEXT: sqshrn2 v0.4s, v1.2d, #5 ; CHECK-NEXT: ret entry: %s = ashr <4 x i64> %x, @@ -354,10 +328,8 @@ entry: define <4 x i32> @unsigned_minmax_v4i64_to_v8i32(<4 x i64> %x) { ; CHECK-LABEL: unsigned_minmax_v4i64_to_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushr v0.2d, v0.2d, #5 -; CHECK-NEXT: ushr v1.2d, v1.2d, #5 -; CHECK-NEXT: uqxtn v0.2s, v0.2d -; CHECK-NEXT: uqxtn2 v0.4s, v1.2d +; CHECK-NEXT: uqshrn v0.2s, v0.2d, #5 +; CHECK-NEXT: uqshrn2 v0.4s, v1.2d, #5 ; CHECK-NEXT: ret entry: %s = lshr <4 x i64> %x, @@ -369,10 +341,8 @@ entry: define <4 x i32> @unsigned_signed_minmax_v4i64_to_v8i32(<4 x i64> %x) { ; CHECK-LABEL: unsigned_signed_minmax_v4i64_to_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshr v0.2d, v0.2d, #5 -; CHECK-NEXT: sshr v1.2d, v1.2d, #5 -; CHECK-NEXT: sqxtun v0.2s, v0.2d -; CHECK-NEXT: sqxtun2 v0.4s, v1.2d +; CHECK-NEXT: sqshrun v0.2s, v0.2d, #5 +; CHECK-NEXT: sqshrun2 v0.4s, v1.2d, #5 ; CHECK-NEXT: ret entry: %s = ashr <4 x i64> %x, -- GitLab From 1dfdbf716112627dea5e79f7f4f1e1e9335ee9df Mon Sep 17 00:00:00 2001 From: Boaz Brickner Date: Mon, 21 Oct 2024 16:05:24 +0200 Subject: [PATCH 240/511] [clang] Add covariance tests that make sure we return an error when return value is different in pointer / lvalue ref / rvalue ref (#112853) Per https://cplusplus.github.io/CWG/issues/960.html. --- clang/test/CXX/drs/cwg9xx.cpp | 50 +++++++++++++++++++++++++++++++++++ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/clang/test/CXX/drs/cwg9xx.cpp b/clang/test/CXX/drs/cwg9xx.cpp index 2700b0f5662a..d4f54bcdad6e 100644 --- a/clang/test/CXX/drs/cwg9xx.cpp +++ b/clang/test/CXX/drs/cwg9xx.cpp @@ -93,6 +93,56 @@ struct B : A { } // namespace example2 } // namespace cwg952 +namespace cwg960 { // cwg960: 3.0 +struct a {}; +class A { +#if __cplusplus >= 201103L + // Check lvalue ref vs rvalue ref vs pointer. + virtual a& rvalue_ref(); + virtual a&& lvalue_ref(); + virtual a& rvalue_vs_lvalue_ref(); // #cwg960-A-rvalue_vs_lvalue_ref + virtual a&& lvalue_vs_rvalue_ref(); // #cwg960-A-lvalue_vs_rvalue_ref + virtual a& rvalue_ref_vs_pointer(); // #cwg960-A-rvalue_ref_vs_pointer + virtual a* pointer_vs_rvalue_ref(); // #cwg960-A-pointer_vs_rvalue_ref + virtual a&& lvalue_ref_vs_pointer(); // #cwg960-A-lvalue_ref_vs_pointer + virtual a* pointer_vs_lvalue_ref(); // #cwg960-A-pointer_vs_lvalue_ref +#endif +}; + +class B : A { +#if __cplusplus >= 201103L + // Check lvalue ref vs rvalue ref vs pointer. 
+  a& rvalue_ref() override;
+  a&& lvalue_ref() override;
+
+  a&& rvalue_vs_lvalue_ref() override;
+  // since-cxx11-error@-1 {{virtual function 'rvalue_vs_lvalue_ref' has a different return type ('a &&') than the function it overrides (which has return type 'a &')}}
+  // since-cxx11-note@#cwg960-A-rvalue_vs_lvalue_ref {{overridden virtual function is here}}
+
+  a& lvalue_vs_rvalue_ref() override;
+  // since-cxx11-error@-1 {{virtual function 'lvalue_vs_rvalue_ref' has a different return type ('a &') than the function it overrides (which has return type 'a &&')}}
+  // since-cxx11-note@#cwg960-A-lvalue_vs_rvalue_ref {{overridden virtual function is here}}
+
+  a* rvalue_ref_vs_pointer() override;
+  // since-cxx11-error@-1 {{virtual function 'rvalue_ref_vs_pointer' has a different return type ('a *') than the function it overrides (which has return type 'a &')}}
+  // since-cxx11-note@#cwg960-A-rvalue_ref_vs_pointer {{overridden virtual function is here}}
+
+  a& pointer_vs_rvalue_ref() override;
+  // since-cxx11-error@-1 {{virtual function 'pointer_vs_rvalue_ref' has a different return type ('a &') than the function it overrides (which has return type 'a *')}}
+  // since-cxx11-note@#cwg960-A-pointer_vs_rvalue_ref {{overridden virtual function is here}}
+
+  a* lvalue_ref_vs_pointer() override;
+  // since-cxx11-error@-1 {{virtual function 'lvalue_ref_vs_pointer' has a different return type ('a *') than the function it overrides (which has return type 'a &&')}}
+  // since-cxx11-note@#cwg960-A-lvalue_ref_vs_pointer {{overridden virtual function is here}}
+
+  a&& pointer_vs_lvalue_ref() override;
+  // since-cxx11-error@-1 {{virtual function 'pointer_vs_lvalue_ref' has a different return type ('a &&') than the function it overrides (which has return type 'a *')}}
+  // since-cxx11-note@#cwg960-A-pointer_vs_lvalue_ref {{overridden virtual function is here}}
+#endif
+};
+
+} // namespace cwg960
+
 namespace cwg974 { // cwg974: yes
 #if __cplusplus >= 201103L
 void test() {
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 6f3cc8247d2e..714fb5c14aff 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -5579,7 +5579,7 @@ and POD class
 960
 CD2
 Covariant functions and lvalue/rvalue references
- Unknown
+ Clang 3.0
 961
-- 
GitLab


From 67ff5ba9af9754261abe11d762af11532a816126 Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Mon, 21 Oct 2024 15:18:19 +0100
Subject: [PATCH 241/511] [AArch64] Add assembly/disassembly of atomic ld/st
 (#112892)

This patch adds assembly/disassembly for the following instructions:
ldfadd{a,al,l,}, ldbfadd{a,al,l,}
ldfmax{a,al,l,}, ldbfmax{a,al,l,}
ldfmaxnm{a,al,l,}, ldbfmaxnm{a,al,l,}
ldfmin{a,al,l,}, ldbfmin{a,al,l,}
ldfminnm{a,al,l,} ldbfminnm{a,al,l,}
stfadd{l,}, stbfadd{l,}
stfmax{l,}, stbfmax{l,}
stfmaxnm{l,}, stbfmaxnm{l,}
stfmin{l,}, stbfmin{l,}
stfminnm{l,}, stbfminnm{l,}
According to [1]
[1]https://developer.arm.com/documentation/ddi0602

Co-authored-by: Spencer Abson [spencer.abson@arm.com](mailto:spencer.abson@arm.com)
Co-authored-by: Caroline Concatto [caroline.concatto@arm.com](mailto:caroline.concatto@arm.com)
---
 .../lib/Target/AArch64/AArch64InstrFormats.td |  61 +++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  72 ++++++
 .../MC/AArch64/LSFE/directive-arch-negative.s |   7 +
 llvm/test/MC/AArch64/LSFE/directive-arch.s    |   5 +
 .../LSFE/directive-arch_extension-negative.s  |   7 +
 .../AArch64/LSFE/directive-arch_extension.s   |   5 +
 .../MC/AArch64/LSFE/directive-cpu-negative.s  |   7 +
 llvm/test/MC/AArch64/LSFE/directive-cpu.s     |   5 +
 .../test/MC/AArch64/LSFE/ldfadd-diagnostics.s | 241 ++++++++++++++++++
 llvm/test/MC/AArch64/LSFE/ldfadd.s            | 225 ++++++++++++++++
 .../test/MC/AArch64/LSFE/ldfmax-diagnostics.s | 241 ++++++++++++++++++
 llvm/test/MC/AArch64/LSFE/ldfmax.s            | 225 ++++++++++++++++
 .../MC/AArch64/LSFE/ldfmaxnm-diagnostics.s    | 241 ++++++++++++++++++
 llvm/test/MC/AArch64/LSFE/ldfmaxnm.s          | 225 ++++++++++++++++
 .../test/MC/AArch64/LSFE/ldfmin-diagnostics.s | 241 ++++++++++++++++++
 llvm/test/MC/AArch64/LSFE/ldfmin.s            | 225 ++++++++++++++++
 .../MC/AArch64/LSFE/ldfminnm-diagnostics.s    | 241 ++++++++++++++++++
 llvm/test/MC/AArch64/LSFE/ldfminnm.s          | 225 ++++++++++++++++
 .../test/MC/AArch64/LSFE/stfadd-diagnostics.s |  73 ++++++
 llvm/test/MC/AArch64/LSFE/stfadd.s            | 121 +++++++++
 .../test/MC/AArch64/LSFE/stfmax-diagnostics.s |  73 ++++++
 llvm/test/MC/AArch64/LSFE/stfmax.s            | 121 +++++++++
 .../MC/AArch64/LSFE/stfmaxnm-diagnostics.s    |  73 ++++++
 llvm/test/MC/AArch64/LSFE/stfmaxnm.s          | 121 +++++++++
 .../test/MC/AArch64/LSFE/stfmin-diagnostics.s |  73 ++++++
 llvm/test/MC/AArch64/LSFE/stfmin.s            | 121 +++++++++
 .../MC/AArch64/LSFE/stfminnm-diagnostics.s    |  73 ++++++
 llvm/test/MC/AArch64/LSFE/stfminnm.s          | 121 +++++++++
 28 files changed, 3469 insertions(+)
 create mode 100644 llvm/test/MC/AArch64/LSFE/directive-arch-negative.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/directive-arch.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/directive-arch_extension-negative.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/directive-arch_extension.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/directive-cpu-negative.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/directive-cpu.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfadd-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfadd.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfmax-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfmax.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfmaxnm-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfmaxnm.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfmin-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfmin.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfminnm-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/ldfminnm.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfadd-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfadd.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfmax-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfmax.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfmaxnm-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfmaxnm.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfmin-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfmin.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfminnm-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/LSFE/stfminnm.s

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 1d1d9b5512cf..4b24b166143d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -12626,3 +12626,64 @@ def : TokenAlias<".H", ".h">;
 def : TokenAlias<".S", ".s">;
 def : TokenAlias<".D", ".d">;
 def : TokenAlias<".Q", ".q">;
+
+//----------------------------------------------------------------------------
+// 2024 Armv9.6 Extensions
+//----------------------------------------------------------------------------
+
+let mayLoad = 1, mayStore = 1 in
+class BaseAtomicFPLoad<RegisterClass regtype, bits<2> sz, bits<2> AR,
+                       bits<3> op0, string asm>
+: I<(outs regtype:$Rt),
+  (ins regtype:$Rs, GPR64sp:$Rn),
+  asm, "\t$Rs, $Rt, [$Rn]","", []>,
+  Sched<[]> {
+  bits<5> Rt;
+  bits<5> Rs;
+  bits<5> Rn;
+  let Inst{31-30} = sz;
+  let Inst{29-24} = 0b111100;
+  let Inst{23-22} = AR;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b0;
+  let Inst{14-12} = op0;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+}
+
+multiclass AtomicFPLoad<bits<2> AR, bits<3> op0, string asm> {
+  def D : BaseAtomicFPLoad<FPR64, 0b11, AR, op0, asm>;
+  def S : BaseAtomicFPLoad<FPR32, 0b10, AR, op0, asm>;
+  def H : BaseAtomicFPLoad<FPR16, 0b01, AR, op0, asm>;
+}
+
+let mayLoad = 1, mayStore = 1 in
+class BaseAtomicFPStore<RegisterClass regtype, bits<2> sz, bit R,
+                        bits<3> op0, string asm>
+: I<(outs),
+  (ins regtype:$Rs, GPR64sp:$Rn),
+  asm, "\t$Rs, [$Rn]",
+  "", []>,
+  Sched<[]> {
+  bits<5> Rt;
+  bits<5> Rs;
+  bits<5> Rn;
+  let Inst{31-30} = sz;
+  let Inst{29-23} = 0b1111000;
+  let Inst{22} = R;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b1;
+  let Inst{14-12} = op0;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = 0b11111;
+}
+
+multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
+  def D : BaseAtomicFPStore<FPR64, 0b11, R, op0, asm>;
+  def S : BaseAtomicFPStore<FPR32, 0b10, R, op0, asm>;
+  def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 76a1029415b1..1eb93066cfd8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10299,6 +10299,78 @@ defm : PromoteBinaryv8f16Tov4f32;
 defm : PromoteBinaryv8f16Tov4f32;
 defm : PromoteBinaryv8f16Tov4f32;
+
+//===-----------------------------------------------------===//
+// Atomic floating-point in-memory instructions (FEAT_LSFE)
+//===-----------------------------------------------------===//
+
+let Predicates = [HasLSFE] in {
+  // Floating-point Atomic Load
+  defm LDFADDA    : AtomicFPLoad<0b10, 0b000, "ldfadda">;
+  defm LDFADDAL   : AtomicFPLoad<0b11, 0b000, "ldfaddal">;
+  defm LDFADD     : AtomicFPLoad<0b00, 0b000, "ldfadd">;
+  defm LDFADDL    : AtomicFPLoad<0b01, 0b000, "ldfaddl">;
+  defm LDFMAXA    : AtomicFPLoad<0b10, 0b100, "ldfmaxa">;
+  defm LDFMAXAL   : AtomicFPLoad<0b11, 0b100, "ldfmaxal">;
+  defm LDFMAX     : AtomicFPLoad<0b00, 0b100, "ldfmax">;
+  defm LDFMAXL    : AtomicFPLoad<0b01, 0b100, "ldfmaxl">;
+  defm LDFMINA    : AtomicFPLoad<0b10, 0b101, "ldfmina">;
+  defm LDFMINAL   : AtomicFPLoad<0b11, 0b101, "ldfminal">;
+  defm LDFMIN     : AtomicFPLoad<0b00, 0b101, "ldfmin">;
+  defm LDFMINL    : AtomicFPLoad<0b01, 0b101, "ldfminl">;
+  defm LDFMAXNMA  : AtomicFPLoad<0b10, 0b110, "ldfmaxnma">;
+  defm LDFMAXNMAL : AtomicFPLoad<0b11, 0b110, "ldfmaxnmal">;
+  defm LDFMAXNM   : AtomicFPLoad<0b00, 0b110, "ldfmaxnm">;
+  defm LDFMAXNML  : AtomicFPLoad<0b01, 0b110, "ldfmaxnml">;
+  defm LDFMINNMA  : AtomicFPLoad<0b10, 0b111, "ldfminnma">;
+  defm LDFMINNMAL : AtomicFPLoad<0b11, 0b111, "ldfminnmal">;
+  defm LDFMINMN   : AtomicFPLoad<0b00, 0b111, "ldfminnm">;
+  defm LDFMINNML  : AtomicFPLoad<0b01, 0b111, "ldfminnml">;
+  // BFloat16
+  def LDBFADDA    : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b000, "ldbfadda">;
+  def LDBFADDAL   : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b000, "ldbfaddal">;
+  def LDBFADD     : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b000, "ldbfadd">;
+  def LDBFADDL    : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b000, "ldbfaddl">;
+  def LDBFMAXA    : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b100, "ldbfmaxa">;
+  def LDBFMAXAL   : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b100, "ldbfmaxal">;
+  def LDBFMAX     : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b100, "ldbfmax">;
+  def LDBFMAXL    : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b100, "ldbfmaxl">;
+  def LDBFMINA    : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b101, "ldbfmina">;
+  def LDBFMINAL   : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b101, "ldbfminal">;
+  def LDBFMIN     : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b101, "ldbfmin">;
+  def LDBFMINL    : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b101, "ldbfminl">;
+  def LDBFMAXNMA  : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b110, "ldbfmaxnma">;
+  def LDBFMAXNMAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b110, "ldbfmaxnmal">;
+  def LDBFMAXNM   : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b110, "ldbfmaxnm">;
+  def LDBFMAXNML  : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b110, "ldbfmaxnml">;
+  def LDBFMINNMA  : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b111, "ldbfminnma">;
+  def LDBFMINNMAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b111, "ldbfminnmal">;
+  def LDBFMINNM   : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b111, "ldbfminnm">;
+  def LDBFMINNML  : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b111, "ldbfminnml">;
+
+  // Floating-point Atomic Store
+  defm STFADD     : AtomicFPStore<0b0, 0b000, "stfadd">;
+  defm STFADDL    : AtomicFPStore<0b1, 0b000, "stfaddl">;
+  defm STFMAX     : AtomicFPStore<0b0, 0b100, "stfmax">;
+  defm STFMAXL    : AtomicFPStore<0b1, 0b100, "stfmaxl">;
+  defm STFMIN     : AtomicFPStore<0b0, 0b101, "stfmin">;
+  defm STFMINL    : AtomicFPStore<0b1, 0b101, "stfminl">;
+  defm STFMAXNM   : AtomicFPStore<0b0, 0b110, "stfmaxnm">;
+  defm STFMAXNML  : AtomicFPStore<0b1, 0b110, "stfmaxnml">;
+  defm STFMINNM   : AtomicFPStore<0b0, 0b111, "stfminnm">;
+  defm STFMINNML  : AtomicFPStore<0b1, 0b111, "stfminnml">;
+  // BFloat16
+  def STBFADD     : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b000, "stbfadd">;
+  def STBFADDL    : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b000, "stbfaddl">;
+  def STBFMAX     : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b100, "stbfmax">;
+  def STBFMAXL    : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b100, "stbfmaxl">;
+  def STBFMIN     : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b101, "stbfmin">;
+  def STBFMINL    : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b101, "stbfminl">;
+  def STBFMAXNM   : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b110, "stbfmaxnm">;
+  def STBFMAXNML  : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b110, "stbfmaxnml">;
+  def STBFMINNM   : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b111, "stbfminnm">;
+  def STBFMINNML  : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b111, "stbfminnml">;
+}
+
 include "AArch64InstrAtomics.td"
 include "AArch64SVEInstrInfo.td"
 include "AArch64SMEInstrInfo.td"
diff --git a/llvm/test/MC/AArch64/LSFE/directive-arch-negative.s b/llvm/test/MC/AArch64/LSFE/directive-arch-negative.s
new file mode 100644
index 000000000000..2520d777f4cf
--- /dev/null
+++ b/llvm/test/MC/AArch64/LSFE/directive-arch-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.arch armv9.6-a+lsfe
+.arch armv9.6-a+nolsfe
+ldfadd h0, h1, [x2]
+// CHECK: error: instruction requires: lsfe
+// CHECK: ldfadd h0, h1, [x2]
\ No newline at end of file
diff --git a/llvm/test/MC/AArch64/LSFE/directive-arch.s b/llvm/test/MC/AArch64/LSFE/directive-arch.s
new file mode 100644
index 000000000000..4fd6135e1a51
--- /dev/null
+++ b/llvm/test/MC/AArch64/LSFE/directive-arch.s
@@ -0,0 +1,5 @@
+// RUN: llvm-mc -triple aarch64 -o - %s 2>&1 | FileCheck %s
+
+.arch armv9.6-a+lsfe
+ldfadd h0, h1, [x2]
+// CHECK: ldfadd h0, h1, [x2]
\ No newline at end of file
diff --git a/llvm/test/MC/AArch64/LSFE/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/LSFE/directive-arch_extension-negative.s
new file mode 100644
index 000000000000..f74dfb13e824
--- /dev/null
+++ b/llvm/test/MC/AArch64/LSFE/directive-arch_extension-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.arch_extension lsfe
+.arch_extension nolsfe
+ldfadd h0, h1, [x2]
+// CHECK: error: instruction requires: lsfe
+// CHECK-NEXT: ldfadd h0, h1, [x2]
\ No newline at end of file
diff --git a/llvm/test/MC/AArch64/LSFE/directive-arch_extension.s b/llvm/test/MC/AArch64/LSFE/directive-arch_extension.s
new file mode 100644
index 000000000000..1dfca73aeb34
--- /dev/null
+++ b/llvm/test/MC/AArch64/LSFE/directive-arch_extension.s
@@ -0,0 +1,5 @@
+// RUN: llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.arch_extension lsfe
+ldfadd h0, h1, [x2]
+// CHECK: ldfadd h0, h1, [x2]
\ No newline at end of file
diff --git a/llvm/test/MC/AArch64/LSFE/directive-cpu-negative.s b/llvm/test/MC/AArch64/LSFE/directive-cpu-negative.s
new file mode 100644
index 000000000000..443161b51e94
--- /dev/null
+++ b/llvm/test/MC/AArch64/LSFE/directive-cpu-negative.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+.cpu generic+lsfe
+.cpu generic+nolsfe
+ldfadd h0, h1, [x2]
+// CHECK: error: instruction requires: lsfe
+// CHECK-NEXT: ldfadd h0, h1, [x2]
\ No
newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/directive-cpu.s b/llvm/test/MC/AArch64/LSFE/directive-cpu.s new file mode 100644 index 000000000000..ae58cd67c2c7 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/directive-cpu.s @@ -0,0 +1,5 @@ +// RUN: llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s + +.cpu generic+lsfe +ldfadd h0, h1, [x2] +// CHECK: ldfadd h0, h1, [x2] diff --git a/llvm/test/MC/AArch64/LSFE/ldfadd-diagnostics.s b/llvm/test/MC/AArch64/LSFE/ldfadd-diagnostics.s new file mode 100644 index 000000000000..eb36a0286db7 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfadd-diagnostics.s @@ -0,0 +1,241 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// LDFADD +//------------------------------------------------------------------------------ + +ldfadd h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadd h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfadd s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadd s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfadd d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadd d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfadd d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadd d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfadd s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadd s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfadda + +ldfadda h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadda h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfadda s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadda s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfadda d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadda d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfadda d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadda d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfadda s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfadda s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfaddal + +ldfaddal h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddal h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfaddal s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddal s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfaddal d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddal d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfaddal d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddal d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfaddal s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddal s0, s1, [x2, #4] +// 
CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfaddl + +ldfaddl h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddl h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfaddl s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddl s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfaddl d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddl d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfaddl d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddl d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfaddl s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfaddl s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// LDBFADD +//------------------------------------------------------------------------------ + +ldbfadd s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadd s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadd h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadd h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadd s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadd s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadd d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadd d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadd h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadd h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadd h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadd h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfadda + +ldbfadda s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadda s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadda h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadda h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadda s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadda s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadda d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadda d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadda h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadda h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfadda h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfadda h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfaddal + +ldbfaddal s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddal s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddal h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: 
ldbfaddal h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddal s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddal s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddal d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddal d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddal h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddal h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddal h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddal h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfaddl + +ldbfaddl s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddl s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddl h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddl h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddl s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddl s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddl d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddl d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddl h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddl h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfaddl h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfaddl h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/ldfadd.s b/llvm/test/MC/AArch64/LSFE/ldfadd.s new file mode 100644 index 000000000000..e80ca3b70533 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfadd.s @@ -0,0 +1,225 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
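+// The two sed commands in the round-trip below delete the .text directive +// line and strip everything up to "encoding: ", so only the byte lists +// (e.g. [0x41,0x00,0x20,0x7c]) are fed back into llvm-mc -disassemble.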
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// LDFADD +//------------------------------------------------------------------------------ + +ldfadd h0, h1, [x2] +// CHECK-INST: ldfadd h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x00,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c200041 + +ldfadd h2, h3, [sp] +// CHECK-INST: ldfadd h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c2203e3 + +ldfadd s0, s1, [x2] +// CHECK-INST: ldfadd s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x00,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc200041 + +ldfadd s2, s3, [sp] +// CHECK-INST: ldfadd s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc2203e3 + +ldfadd d0, d1, [x2] +// CHECK-INST: ldfadd d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x00,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc200041 + +ldfadd d2, d3, [sp] +// CHECK-INST: ldfadd d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc2203e3 + +// -- ldfadda + +ldfadda h0, h1, [x2] +// CHECK-INST: ldfadda h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x00,0xa0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca00041 + +ldfadda h2, h3, [sp] +// CHECK-INST: ldfadda h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0xa2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca203e3 + +ldfadda s0, s1, [x2] +// CHECK-INST: ldfadda s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x00,0xa0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca00041 + +ldfadda s2, s3, [sp] +// CHECK-INST: ldfadda s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0xa2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca203e3 + +ldfadda d0, d1, [x2] +// CHECK-INST: ldfadda d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x00,0xa0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca00041 + +ldfadda d2, d3, [sp] +// CHECK-INST: ldfadda d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0xa2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca203e3 + +// -- ldfaddal + +ldfaddal h0, h1, [x2] +// CHECK-INST: ldfaddal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x00,0xe0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce00041 + +ldfaddal h2, h3, [sp] +// CHECK-INST: ldfaddal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0xe2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce203e3 + +ldfaddal s0, s1, [x2] +// CHECK-INST: ldfaddal s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x00,0xe0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce00041 + +ldfaddal s2, s3, [sp] +// CHECK-INST: ldfaddal s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0xe2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce203e3 + +ldfaddal d0, d1, [x2] +// CHECK-INST: ldfaddal d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x00,0xe0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce00041 + +ldfaddal d2, d3, [sp] +// CHECK-INST: ldfaddal d2, d3, [sp] +// CHECK-ENCODING: 
[0xe3,0x03,0xe2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce203e3 + +// -- ldfaddl + +ldfaddl h0, h1, [x2] +// CHECK-INST: ldfaddl h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x00,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c600041 + +ldfaddl h2, h3, [sp] +// CHECK-INST: ldfaddl h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c6203e3 + +ldfaddl s0, s1, [x2] +// CHECK-INST: ldfaddl s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x00,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc600041 + +ldfaddl s2, s3, [sp] +// CHECK-INST: ldfaddl s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc6203e3 + +ldfaddl d0, d1, [x2] +// CHECK-INST: ldfaddl d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x00,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc600041 + +ldfaddl d2, d3, [sp] +// CHECK-INST: ldfaddl d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc6203e3 + +//------------------------------------------------------------------------------ +// LDBFADD +//------------------------------------------------------------------------------ + +ldbfadd h0, h1, [x2] +// CHECK-INST: ldbfadd h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x00,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c200041 + +ldbfadd h2, h3, [sp] +// CHECK-INST: ldbfadd h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c2203e3 + +// -- ldbfadda + +ldbfadda h0, h1, [x2] +// CHECK-INST: ldbfadda h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x00,0xa0,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ca00041 + +ldbfadda h2, h3, [sp] +// CHECK-INST: ldbfadda h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0xa2,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ca203e3 + +// -- ldbfaddal + +ldbfaddal h0, h1, [x2] +// CHECK-INST: ldbfaddal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x00,0xe0,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ce00041 + +ldbfaddal h2, h3, [sp] +// CHECK-INST: ldbfaddal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0xe2,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ce203e3 + +// -- ldbfaddl + +ldbfaddl h0, h1, [x2] +// CHECK-INST: ldbfaddl h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x00,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c600041 + +ldbfaddl h2, h3, [sp] +// CHECK-INST: ldbfaddl h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x03,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c6203e3 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/ldfmax-diagnostics.s b/llvm/test/MC/AArch64/LSFE/ldfmax-diagnostics.s new file mode 100644 index 000000000000..e062e400b67f --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfmax-diagnostics.s @@ -0,0 +1,241 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// LDFMAX +//------------------------------------------------------------------------------ + +ldfmax h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmax h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmax s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmax s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmax d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmax d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmax d0, d1, [w2] +// CHECK:
[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmax d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmax s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmax s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfmaxa + +ldfmaxa h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxa h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxa s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxa s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxa d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxa d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxa d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxa d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxa s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxa s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfmaxal + +ldfmaxal h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxal h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxal s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxal s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxal d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxal d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxal d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxal d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxal s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxal s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfmaxl + +ldfmaxl h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxl h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxl s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxl s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxl d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxl d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxl d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxl d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxl s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxl s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// LDBFMAX +//------------------------------------------------------------------------------ + +ldbfmax s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmax s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmax h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmax h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmax s0, 
s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmax s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmax d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmax d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmax h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmax h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmax h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmax h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfmaxa + +ldbfmaxa s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxa s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxa h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxa h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxa s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxa s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxa d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxa d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxa h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxa h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxa h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxa h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfmaxal + +ldbfmaxal s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxal s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxal h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxal h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxal s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxal s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxal d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxal d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxal h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxal h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxal h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxal h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfmaxl + +ldbfmaxl s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxl s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxl h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxl h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxl s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxl s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxl d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxl d0, d1, [x2] +// 
CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxl h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxl h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxl h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxl h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/ldfmax.s b/llvm/test/MC/AArch64/LSFE/ldfmax.s new file mode 100644 index 000000000000..40c9e05d1b23 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfmax.s @@ -0,0 +1,225 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// LDFMAX +//------------------------------------------------------------------------------ + +ldfmax h0, h1, [x2] +// CHECK-INST: ldfmax h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x40,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c204041 + +ldfmax h2, h3, [sp] +// CHECK-INST: ldfmax h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c2243e3 + +ldfmax s0, s1, [x2] +// CHECK-INST: ldfmax s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x40,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc204041 + +ldfmax s2, s3, [sp] +// CHECK-INST: ldfmax s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc2243e3 + +ldfmax d0, d1, [x2] +// CHECK-INST: ldfmax d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x40,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc204041 + +ldfmax d2, d3, [sp] +// CHECK-INST: ldfmax d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc2243e3 + +// -- ldfmaxa + +ldfmaxa h0, h1, [x2] +// CHECK-INST: ldfmaxa h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x40,0xa0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca04041 + +ldfmaxa h2, h3, [sp] +// CHECK-INST: ldfmaxa h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0xa2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca243e3 + +ldfmaxa s0, s1, [x2] +// CHECK-INST: ldfmaxa s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x40,0xa0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca04041 + +ldfmaxa s2, s3, [sp] +// CHECK-INST: ldfmaxa s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0xa2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca243e3 + +ldfmaxa d0, d1, [x2] +// CHECK-INST: ldfmaxa d0, d1, [x2] +// CHECK-ENCODING: 
[0x41,0x40,0xa0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca04041 + +ldfmaxa d2, d3, [sp] +// CHECK-INST: ldfmaxa d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0xa2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca243e3 + +// -- ldfmaxal + +ldfmaxal h0, h1, [x2] +// CHECK-INST: ldfmaxal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x40,0xe0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce04041 + +ldfmaxal h2, h3, [sp] +// CHECK-INST: ldfmaxal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0xe2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce243e3 + +ldfmaxal s0, s1, [x2] +// CHECK-INST: ldfmaxal s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x40,0xe0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce04041 + +ldfmaxal s2, s3, [sp] +// CHECK-INST: ldfmaxal s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0xe2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce243e3 + +ldfmaxal d0, d1, [x2] +// CHECK-INST: ldfmaxal d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x40,0xe0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce04041 + +ldfmaxal d2, d3, [sp] +// CHECK-INST: ldfmaxal d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0xe2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce243e3 + +// -- ldfmaxl + +ldfmaxl h0, h1, [x2] +// CHECK-INST: ldfmaxl h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x40,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c604041 + +ldfmaxl h2, h3, [sp] +// CHECK-INST: ldfmaxl h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c6243e3 + +ldfmaxl s0, s1, [x2] +// CHECK-INST: ldfmaxl s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x40,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc604041 + +ldfmaxl s2, s3, [sp] +// CHECK-INST: ldfmaxl s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc6243e3 + +ldfmaxl d0, d1, [x2] +// CHECK-INST: ldfmaxl d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x40,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc604041 + +ldfmaxl d2, d3, [sp] +// CHECK-INST: ldfmaxl d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc6243e3 + +//------------------------------------------------------------------------------ +// LDBFMAX +//------------------------------------------------------------------------------ + +ldbfmax h0, h1, [x2] +// CHECK-INST: ldbfmax h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x40,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c204041 + +ldbfmax h2, h3, [sp] +// CHECK-INST: ldbfmax h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c2243e3 + +// -- ldbfmaxa + +ldbfmaxa h0, h1, [x2] +// CHECK-INST: ldbfmaxa h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x40,0xa0,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ca04041 + +ldbfmaxa h2, h3, [sp] +// CHECK-INST: ldbfmaxa h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0xa2,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ca243e3 + +// -- ldbfmaxal + +ldbfmaxal h0, h1, [x2] +// CHECK-INST: ldbfmaxal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x40,0xe0,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ce04041 + +ldbfmaxal h2, h3, [sp] +// 
CHECK-INST: ldbfmaxal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0xe2,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ce243e3 + +// -- ldbfmaxl + +ldbfmaxl h0, h1, [x2] +// CHECK-INST: ldbfmaxl h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x40,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c604041 + +ldbfmaxl h2, h3, [sp] +// CHECK-INST: ldbfmaxl h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x43,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c6243e3 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/ldfmaxnm-diagnostics.s b/llvm/test/MC/AArch64/LSFE/ldfmaxnm-diagnostics.s new file mode 100644 index 000000000000..2bd4da5624b0 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfmaxnm-diagnostics.s @@ -0,0 +1,241 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// LDFMAXNM +//------------------------------------------------------------------------------ + +ldfmaxnm h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnm h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnm s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnm s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnm d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnm d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnm d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnm d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnm s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnm s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfmaxnma + +ldfmaxnma h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnma h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnma s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnma s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnma d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnma d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnma d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnma d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnma s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnma s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfmaxnmal + +ldfmaxnmal h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnmal h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnmal s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnmal s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnmal d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnmal d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnmal d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: 
ldfmaxnmal d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnmal s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnmal s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfmaxnml + +ldfmaxnml h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnml h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnml s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnml s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnml d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnml d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnml d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnml d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmaxnml s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmaxnml s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// LDBFMAXNM +//------------------------------------------------------------------------------ + +ldbfmaxnm s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnm s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnm h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnm h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnm s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnm s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnm d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnm d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnm h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnm h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnm h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnm h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfmaxnma + +ldbfmaxnma s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnma s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnma h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnma h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnma s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnma s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnma d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnma d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnma h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnma h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnma h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnma h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfmaxnmal + 
+ldbfmaxnmal s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnmal s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnmal h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnmal h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnmal s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnmal s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnmal d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnmal d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnmal h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnmal h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnmal h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnmal h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfmaxnml + +ldbfmaxnml s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnml s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnml h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnml h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnml s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnml s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnml d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnml d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnml h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnml h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmaxnml h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmaxnml h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/ldfmaxnm.s b/llvm/test/MC/AArch64/LSFE/ldfmaxnm.s new file mode 100644 index 000000000000..bfa1c50d41bf --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfmaxnm.s @@ -0,0 +1,225 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches.
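+// Prefix summary: CHECK-INST and CHECK-ENCODING match the +lsfe assembler +// and disassembler output, CHECK-ERROR matches the diagnostic emitted when +// lsfe is not enabled, and CHECK-UNKNOWN matches the undecoded word printed +// by llvm-objdump with --mattr=-lsfe.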
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// LDFMAXNM +//------------------------------------------------------------------------------ + +ldfmaxnm h0, h1, [x2] +// CHECK-INST: ldfmaxnm h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x60,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c206041 + +ldfmaxnm h2, h3, [sp] +// CHECK-INST: ldfmaxnm h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c2263e3 + +ldfmaxnm s0, s1, [x2] +// CHECK-INST: ldfmaxnm s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x60,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc206041 + +ldfmaxnm s2, s3, [sp] +// CHECK-INST: ldfmaxnm s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc2263e3 + +ldfmaxnm d0, d1, [x2] +// CHECK-INST: ldfmaxnm d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x60,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc206041 + +ldfmaxnm d2, d3, [sp] +// CHECK-INST: ldfmaxnm d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc2263e3 + +// -- ldfmaxnma + +ldfmaxnma h0, h1, [x2] +// CHECK-INST: ldfmaxnma h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x60,0xa0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca06041 + +ldfmaxnma h2, h3, [sp] +// CHECK-INST: ldfmaxnma h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0xa2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca263e3 + +ldfmaxnma s0, s1, [x2] +// CHECK-INST: ldfmaxnma s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x60,0xa0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca06041 + +ldfmaxnma s2, s3, [sp] +// CHECK-INST: ldfmaxnma s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0xa2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca263e3 + +ldfmaxnma d0, d1, [x2] +// CHECK-INST: ldfmaxnma d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x60,0xa0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca06041 + +ldfmaxnma d2, d3, [sp] +// CHECK-INST: ldfmaxnma d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0xa2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca263e3 + +// -- ldfmaxnmal + +ldfmaxnmal h0, h1, [x2] +// CHECK-INST: ldfmaxnmal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x60,0xe0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce06041 + +ldfmaxnmal h2, h3, [sp] +// CHECK-INST: ldfmaxnmal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0xe2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce263e3 + +ldfmaxnmal s0, s1, [x2] +// CHECK-INST: ldfmaxnmal s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x60,0xe0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce06041 + +ldfmaxnmal s2, s3, [sp] +// CHECK-INST: ldfmaxnmal s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0xe2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce263e3 + +ldfmaxnmal d0, d1, [x2] +// CHECK-INST: ldfmaxnmal d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x60,0xe0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce06041 + +ldfmaxnmal d2, d3, [sp] 
+// CHECK-INST: ldfmaxnmal d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0xe2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce263e3 + +// -- ldfmaxnml + +ldfmaxnml h0, h1, [x2] +// CHECK-INST: ldfmaxnml h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x60,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c606041 + +ldfmaxnml h2, h3, [sp] +// CHECK-INST: ldfmaxnml h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c6263e3 + +ldfmaxnml s0, s1, [x2] +// CHECK-INST: ldfmaxnml s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x60,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc606041 + +ldfmaxnml s2, s3, [sp] +// CHECK-INST: ldfmaxnml s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc6263e3 + +ldfmaxnml d0, d1, [x2] +// CHECK-INST: ldfmaxnml d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x60,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc606041 + +ldfmaxnml d2, d3, [sp] +// CHECK-INST: ldfmaxnml d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc6263e3 + +//------------------------------------------------------------------------------ +// LDBFMAXNM +//------------------------------------------------------------------------------ + +ldbfmaxnm h0, h1, [x2] +// CHECK-INST: ldbfmaxnm h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x60,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c206041 + +ldbfmaxnm h2, h3, [sp] +// CHECK-INST: ldbfmaxnm h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c2263e3 + +// -- ldbfmaxnma + +ldbfmaxnma h0, h1, [x2] +// CHECK-INST: ldbfmaxnma h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x60,0xa0,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ca06041 + +ldbfmaxnma h2, h3, [sp] +// CHECK-INST: ldbfmaxnma h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0xa2,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ca263e3 + +// -- ldbfmaxnmal + +ldbfmaxnmal h0, h1, [x2] +// CHECK-INST: ldbfmaxnmal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x60,0xe0,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ce06041 + +ldbfmaxnmal h2, h3, [sp] +// CHECK-INST: ldbfmaxnmal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0xe2,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ce263e3 + +// -- ldbfmaxnml + +ldbfmaxnml h0, h1, [x2] +// CHECK-INST: ldbfmaxnml h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x60,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c606041 + +ldbfmaxnml h2, h3, [sp] +// CHECK-INST: ldbfmaxnml h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x63,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c6263e3 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/ldfmin-diagnostics.s b/llvm/test/MC/AArch64/LSFE/ldfmin-diagnostics.s new file mode 100644 index 000000000000..e50f94e8ec45 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfmin-diagnostics.s @@ -0,0 +1,241 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// LDFMIN +//------------------------------------------------------------------------------ + +ldfmin h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid 
operand for instruction +// CHECK-NEXT: ldfmin h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmin s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmin s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmin d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmin d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmin d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmin d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmin s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmin s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfmina + +ldfmina h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmina h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmina s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmina s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmina d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmina d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmina d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmina d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfmina s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfmina s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfminal + +ldfminal h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminal h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminal s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminal s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminal d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminal d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminal d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminal d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminal s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminal s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfminl + +ldfminl h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminl h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminl s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminl s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminl d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminl d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminl d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminl d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminl s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminl s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + 
+//------------------------------------------------------------------------------ +// LDBFMIN +//------------------------------------------------------------------------------ + +ldbfmin s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmin s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmin h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmin h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmin s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmin s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmin d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmin d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmin h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmin h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmin h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmin h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfmina + +ldbfmina s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmina s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmina h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmina h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmina s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmina s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmina d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmina d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmina h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmina h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfmina h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfmina h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfminal + +ldbfminal s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminal s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminal h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminal h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminal s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminal s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminal d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminal d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminal h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminal h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminal h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminal h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfminl + +ldbfminl s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminl s0, h1, 
[x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminl h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminl h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminl s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminl s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminl d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminl d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminl h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminl h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminl h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminl h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/ldfmin.s b/llvm/test/MC/AArch64/LSFE/ldfmin.s new file mode 100644 index 000000000000..4867db04a69c --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfmin.s @@ -0,0 +1,225 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// LDFMIN +//------------------------------------------------------------------------------ + +ldfmin h0, h1, [x2] +// CHECK-INST: ldfmin h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x50,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c205041 + +ldfmin h2, h3, [sp] +// CHECK-INST: ldfmin h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x53,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c2253e3 + +ldfmin s0, s1, [x2] +// CHECK-INST: ldfmin s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x50,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc205041 + +ldfmin s2, s3, [sp] +// CHECK-INST: ldfmin s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x53,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc2253e3 + +ldfmin d0, d1, [x2] +// CHECK-INST: ldfmin d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x50,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc205041 + +ldfmin d2, d3, [sp] +// CHECK-INST: ldfmin d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x53,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc2253e3 + +// -- ldfmina + +ldfmina h0, h1, [x2] +// CHECK-INST: ldfmina h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x50,0xa0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca05041 + +ldfmina h2, h3, [sp] +// CHECK-INST: ldfmina h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x53,0xa2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca253e3 + +ldfmina s0, s1, [x2] +// CHECK-INST: ldfmina s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x50,0xa0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca05041 + +ldfmina s2, s3, [sp] +// CHECK-INST: ldfmina s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x53,0xa2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca253e3 + +ldfmina d0, d1, [x2] +// CHECK-INST: ldfmina d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x50,0xa0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca05041 + +ldfmina d2, d3, [sp] +// CHECK-INST: ldfmina d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x53,0xa2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca253e3 + +// -- ldfminal + +ldfminal h0, h1, [x2] +// CHECK-INST: ldfminal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x50,0xe0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce05041 + +ldfminal h2, h3, [sp] +// CHECK-INST: ldfminal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x53,0xe2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce253e3 + +ldfminal s0, s1, [x2] +// CHECK-INST: ldfminal s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x50,0xe0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce05041 + +ldfminal s2, s3, [sp] +// CHECK-INST: ldfminal s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x53,0xe2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce253e3 + +ldfminal d0, d1, [x2] +// CHECK-INST: ldfminal d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x50,0xe0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce05041 + +ldfminal d2, d3, [sp] +// CHECK-INST: ldfminal d2, d3, [sp] +// CHECK-ENCODING: 
[0xe3,0x53,0xe2,0xfc]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: fce253e3
+
+// -- ldfminl
+
+ldfminl h0, h1, [x2]
+// CHECK-INST: ldfminl h0, h1, [x2]
+// CHECK-ENCODING: [0x41,0x50,0x60,0x7c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 7c605041
+
+ldfminl h2, h3, [sp]
+// CHECK-INST: ldfminl h2, h3, [sp]
+// CHECK-ENCODING: [0xe3,0x53,0x62,0x7c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 7c6253e3
+
+ldfminl s0, s1, [x2]
+// CHECK-INST: ldfminl s0, s1, [x2]
+// CHECK-ENCODING: [0x41,0x50,0x60,0xbc]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: bc605041
+
+ldfminl s2, s3, [sp]
+// CHECK-INST: ldfminl s2, s3, [sp]
+// CHECK-ENCODING: [0xe3,0x53,0x62,0xbc]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: bc6253e3
+
+ldfminl d0, d1, [x2]
+// CHECK-INST: ldfminl d0, d1, [x2]
+// CHECK-ENCODING: [0x41,0x50,0x60,0xfc]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: fc605041
+
+ldfminl d2, d3, [sp]
+// CHECK-INST: ldfminl d2, d3, [sp]
+// CHECK-ENCODING: [0xe3,0x53,0x62,0xfc]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: fc6253e3
+
+//------------------------------------------------------------------------------
+// LDBFMIN
+//------------------------------------------------------------------------------
+
+ldbfmin h0, h1, [x2]
+// CHECK-INST: ldbfmin h0, h1, [x2]
+// CHECK-ENCODING: [0x41,0x50,0x20,0x3c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 3c205041
+
+ldbfmin h2, h3, [sp]
+// CHECK-INST: ldbfmin h2, h3, [sp]
+// CHECK-ENCODING: [0xe3,0x53,0x22,0x3c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 3c2253e3
+
+// -- ldbfmina
+
+ldbfmina h0, h1, [x2]
+// CHECK-INST: ldbfmina h0, h1, [x2]
+// CHECK-ENCODING: [0x41,0x50,0xa0,0x3c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 3ca05041
+
+ldbfmina h2, h3, [sp]
+// CHECK-INST: ldbfmina h2, h3, [sp]
+// CHECK-ENCODING: [0xe3,0x53,0xa2,0x3c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 3ca253e3
+
+// -- ldbfminal
+
+ldbfminal h0, h1, [x2]
+// CHECK-INST: ldbfminal h0, h1, [x2]
+// CHECK-ENCODING: [0x41,0x50,0xe0,0x3c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 3ce05041
+
+ldbfminal h2, h3, [sp]
+// CHECK-INST: ldbfminal h2, h3, [sp]
+// CHECK-ENCODING: [0xe3,0x53,0xe2,0x3c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 3ce253e3
+
+// -- ldbfminl
+
+ldbfminl h0, h1, [x2]
+// CHECK-INST: ldbfminl h0, h1, [x2]
+// CHECK-ENCODING: [0x41,0x50,0x60,0x3c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 3c605041
+
+ldbfminl h2, h3, [sp]
+// CHECK-INST: ldbfminl h2, h3, [sp]
+// CHECK-ENCODING: [0xe3,0x53,0x62,0x3c]
+// CHECK-ERROR: instruction requires: lsfe
+// CHECK-UNKNOWN: 3c6253e3
\ No newline at end of file
diff --git a/llvm/test/MC/AArch64/LSFE/ldfminnm-diagnostics.s b/llvm/test/MC/AArch64/LSFE/ldfminnm-diagnostics.s
new file mode 100644
index 000000000000..e31adb515680
--- /dev/null
+++ b/llvm/test/MC/AArch64/LSFE/ldfminnm-diagnostics.s
@@ -0,0 +1,241 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s
+
+//------------------------------------------------------------------------------
+// LDFMINNM
+//------------------------------------------------------------------------------
+
+ldfminnm h0, s2, [x2]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: ldfminnm h0, s2, [x2]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+ldfminnm s0, d2,
[x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnm s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnm d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnm d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnm d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnm d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnm s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnm s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfminnma + +ldfminnma h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnma h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnma s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnma s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnma d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnma d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnma d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnma d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnma s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnma s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfminnmal + +ldfminnmal h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnmal h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnmal s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnmal s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnmal d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnmal d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnmal d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnmal d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnmal s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnmal s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldfminnml + +ldfminnml h0, s2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnml h0, s2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnml s0, d2, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnml s0, d2, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnml d0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnml d0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnml d0, d1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnml d0, d1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldfminnml s0, s1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldfminnml s0, s1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// LDBFMINNM 
+//------------------------------------------------------------------------------ + +ldbfminnm s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnm s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnm h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnm h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnm s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnm s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnm d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnm d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnm h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnm h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnm h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnm h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfminnma + +ldbfminnma s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnma s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnma h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnma h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnma s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnma s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnma d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnma d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnma h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnma h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnma h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnma h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfminnmal + +ldbfminnmal s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnmal s0, h1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnmal h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnmal h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnmal s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnmal s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnmal d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnmal d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnmal h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnmal h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnmal h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnmal h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- ldbfminnml + +ldbfminnml s0, h1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnml s0, h1, [x2] +// 
CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnml h0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnml h0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnml s0, s1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnml s0, s1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnml d0, d1, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnml d0, d1, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnml h0, h1, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnml h0, h1, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ldbfminnml h0, h1, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ldbfminnml h0, h1, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/ldfminnm.s b/llvm/test/MC/AArch64/LSFE/ldfminnm.s new file mode 100644 index 000000000000..ae027aa1f8ae --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/ldfminnm.s @@ -0,0 +1,225 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// LDFMINNM +//------------------------------------------------------------------------------ + +ldfminnm h0, h1, [x2] +// CHECK-INST: ldfminnm h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x70,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c207041 + +ldfminnm h2, h3, [sp] +// CHECK-INST: ldfminnm h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c2273e3 + +ldfminnm s0, s1, [x2] +// CHECK-INST: ldfminnm s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x70,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc207041 + +ldfminnm s2, s3, [sp] +// CHECK-INST: ldfminnm s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc2273e3 + +ldfminnm d0, d1, [x2] +// CHECK-INST: ldfminnm d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x70,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc207041 + +ldfminnm d2, d3, [sp] +// CHECK-INST: ldfminnm d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc2273e3 + +// -- ldfminnma + +ldfminnma h0, h1, [x2] +// CHECK-INST: ldfminnma h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x70,0xa0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca07041 + +ldfminnma h2, h3, [sp] +// CHECK-INST: ldfminnma h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0xa2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ca273e3 + +ldfminnma s0, s1, [x2] +// CHECK-INST: ldfminnma s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x70,0xa0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca07041 + +ldfminnma s2, s3, [sp] +// CHECK-INST: ldfminnma s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0xa2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bca273e3 + +ldfminnma d0, d1, [x2] +// CHECK-INST: ldfminnma d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x70,0xa0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca07041 + +ldfminnma d2, d3, [sp] +// CHECK-INST: ldfminnma d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0xa2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fca273e3 + +// -- ldfminnmal + +ldfminnmal h0, h1, [x2] +// CHECK-INST: ldfminnmal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x70,0xe0,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce07041 + +ldfminnmal h2, h3, [sp] +// CHECK-INST: ldfminnmal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0xe2,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7ce273e3 + +ldfminnmal s0, s1, [x2] +// CHECK-INST: ldfminnmal s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x70,0xe0,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce07041 + +ldfminnmal s2, s3, [sp] +// CHECK-INST: ldfminnmal s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0xe2,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bce273e3 + +ldfminnmal d0, d1, [x2] +// CHECK-INST: ldfminnmal d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x70,0xe0,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce07041 + +ldfminnmal d2, d3, [sp] 
+// CHECK-INST: ldfminnmal d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0xe2,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fce273e3 + +// -- ldfminnml + +ldfminnml h0, h1, [x2] +// CHECK-INST: ldfminnml h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x70,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c607041 + +ldfminnml h2, h3, [sp] +// CHECK-INST: ldfminnml h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c6273e3 + +ldfminnml s0, s1, [x2] +// CHECK-INST: ldfminnml s0, s1, [x2] +// CHECK-ENCODING: [0x41,0x70,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc607041 + +ldfminnml s2, s3, [sp] +// CHECK-INST: ldfminnml s2, s3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc6273e3 + +ldfminnml d0, d1, [x2] +// CHECK-INST: ldfminnml d0, d1, [x2] +// CHECK-ENCODING: [0x41,0x70,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc607041 + +ldfminnml d2, d3, [sp] +// CHECK-INST: ldfminnml d2, d3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc6273e3 + +//------------------------------------------------------------------------------ +// LDBFMINNM +//------------------------------------------------------------------------------ + +ldbfminnm h0, h1, [x2] +// CHECK-INST: ldbfminnm h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x70,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c207041 + +ldbfminnm h2, h3, [sp] +// CHECK-INST: ldbfminnm h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c2273e3 + +// -- ldbfminnma + +ldbfminnma h0, h1, [x2] +// CHECK-INST: ldbfminnma h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x70,0xa0,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ca07041 + +ldbfminnma h2, h3, [sp] +// CHECK-INST: ldbfminnma h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0xa2,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ca273e3 + +// -- ldbfminnmal + +ldbfminnmal h0, h1, [x2] +// CHECK-INST: ldbfminnmal h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x70,0xe0,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ce07041 + +ldbfminnmal h2, h3, [sp] +// CHECK-INST: ldbfminnmal h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0xe2,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3ce273e3 + +// -- ldbfminnml + +ldbfminnml h0, h1, [x2] +// CHECK-INST: ldbfminnml h0, h1, [x2] +// CHECK-ENCODING: [0x41,0x70,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c607041 + +ldbfminnml h2, h3, [sp] +// CHECK-INST: ldbfminnml h2, h3, [sp] +// CHECK-ENCODING: [0xe3,0x73,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c6273e3 \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfadd-diagnostics.s b/llvm/test/MC/AArch64/LSFE/stfadd-diagnostics.s new file mode 100644 index 000000000000..9cfb35f7ca18 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/stfadd-diagnostics.s @@ -0,0 +1,73 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// STFADD +//------------------------------------------------------------------------------ + +stfadd h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand 
for instruction +// CHECK-NEXT: stfadd h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfadd s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfadd s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- stfaddl + +stfaddl h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfaddl h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfaddl s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfaddl s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// STBFADD +//------------------------------------------------------------------------------ + +stbfadd s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfadd s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfadd d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfadd d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfadd h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfadd h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfadd h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfadd h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +// -- stbfaddl + +stbfaddl s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfaddl s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfaddl d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfaddl d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfaddl h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfaddl h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfaddl h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfaddl h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfadd.s b/llvm/test/MC/AArch64/LSFE/stfadd.s new file mode 100644 index 000000000000..5bf3f5eaee9e --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/stfadd.s @@ -0,0 +1,121 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// STFADD +//------------------------------------------------------------------------------ + +stfadd h0, [x2] +// CHECK-INST: stfadd h0, [x2] +// CHECK-ENCODING: [0x5f,0x80,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c20805f + +stfadd h2, [sp] +// CHECK-INST: stfadd h2, [sp] +// CHECK-ENCODING: [0xff,0x83,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c2283ff + +stfadd s0, [x2] +// CHECK-INST: stfadd s0, [x2] +// CHECK-ENCODING: [0x5f,0x80,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc20805f + +stfadd s2, [sp] +// CHECK-INST: stfadd s2, [sp] +// CHECK-ENCODING: [0xff,0x83,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc2283ff + +stfadd d0, [x2] +// CHECK-INST: stfadd d0, [x2] +// CHECK-ENCODING: [0x5f,0x80,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc20805f + +stfadd d2, [sp] +// CHECK-INST: stfadd d2, [sp] +// CHECK-ENCODING: [0xff,0x83,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc2283ff + +// -- stfaddl + +stfaddl h0, [x2] +// CHECK-INST: stfaddl h0, [x2] +// CHECK-ENCODING: [0x5f,0x80,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c60805f + +stfaddl h2, [sp] +// CHECK-INST: stfaddl h2, [sp] +// CHECK-ENCODING: [0xff,0x83,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c6283ff + +stfaddl s0, [x2] +// CHECK-INST: stfaddl s0, [x2] +// CHECK-ENCODING: [0x5f,0x80,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc60805f + +stfaddl s2, [sp] +// CHECK-INST: stfaddl s2, [sp] +// CHECK-ENCODING: [0xff,0x83,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc6283ff + +stfaddl d0, [x2] +// CHECK-INST: stfaddl d0, [x2] +// CHECK-ENCODING: [0x5f,0x80,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc60805f + +stfaddl d2, [sp] +// CHECK-INST: stfaddl d2, [sp] +// CHECK-ENCODING: [0xff,0x83,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc6283ff + +//------------------------------------------------------------------------------ +// STBFADD +//------------------------------------------------------------------------------ + +stbfadd h0, [x2] +// CHECK-INST: stbfadd h0, [x2] +// CHECK-ENCODING: [0x5f,0x80,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c20805f + +stbfadd h2, [sp] +// CHECK-INST: stbfadd h2, [sp] +// CHECK-ENCODING: [0xff,0x83,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c2283ff + +// -- stbfaddl + +stbfaddl h0, [x2] +// CHECK-INST: stbfaddl h0, [x2] +// CHECK-ENCODING: [0x5f,0x80,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c60805f + +stbfaddl h2, [sp] +// CHECK-INST: stbfaddl h2, [sp] +// CHECK-ENCODING: [0xff,0x83,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c6283ff \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfmax-diagnostics.s b/llvm/test/MC/AArch64/LSFE/stfmax-diagnostics.s new file mode 100644 index 000000000000..932ba5ccf6bb --- /dev/null +++ 
b/llvm/test/MC/AArch64/LSFE/stfmax-diagnostics.s @@ -0,0 +1,73 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// STFMAX +//------------------------------------------------------------------------------ + +stfmax h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmax h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfmax s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmax s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- stfmaxl + +stfmaxl h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmaxl h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfmaxl s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmaxl s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// STBFMAX +//------------------------------------------------------------------------------ + +stbfmax s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmax s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfmax d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmax d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmax h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmax h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmax h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmax h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +// -- stbfmaxl + +stbfmaxl s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxl s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfmaxl d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxl d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmaxl h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxl h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmaxl h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxl h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfmax.s b/llvm/test/MC/AArch64/LSFE/stfmax.s new file mode 100644 index 000000000000..7c2c9ffd26b3 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/stfmax.s @@ -0,0 +1,121 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// STFMAX +//------------------------------------------------------------------------------ + +stfmax h0, [x2] +// CHECK-INST: stfmax h0, [x2] +// CHECK-ENCODING: [0x5f,0xc0,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c20c05f + +stfmax h2, [sp] +// CHECK-INST: stfmax h2, [sp] +// CHECK-ENCODING: [0xff,0xc3,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c22c3ff + +stfmax s0, [x2] +// CHECK-INST: stfmax s0, [x2] +// CHECK-ENCODING: [0x5f,0xc0,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc20c05f + +stfmax s2, [sp] +// CHECK-INST: stfmax s2, [sp] +// CHECK-ENCODING: [0xff,0xc3,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc22c3ff + +stfmax d0, [x2] +// CHECK-INST: stfmax d0, [x2] +// CHECK-ENCODING: [0x5f,0xc0,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc20c05f + +stfmax d2, [sp] +// CHECK-INST: stfmax d2, [sp] +// CHECK-ENCODING: [0xff,0xc3,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc22c3ff + +// -- stfmaxl + +stfmaxl h0, [x2] +// CHECK-INST: stfmaxl h0, [x2] +// CHECK-ENCODING: [0x5f,0xc0,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c60c05f + +stfmaxl h2, [sp] +// CHECK-INST: stfmaxl h2, [sp] +// CHECK-ENCODING: [0xff,0xc3,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c62c3ff + +stfmaxl s0, [x2] +// CHECK-INST: stfmaxl s0, [x2] +// CHECK-ENCODING: [0x5f,0xc0,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc60c05f + +stfmaxl s2, [sp] +// CHECK-INST: stfmaxl s2, [sp] +// CHECK-ENCODING: [0xff,0xc3,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc62c3ff + +stfmaxl d0, [x2] +// CHECK-INST: stfmaxl d0, [x2] +// CHECK-ENCODING: [0x5f,0xc0,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc60c05f + +stfmaxl d2, [sp] +// CHECK-INST: stfmaxl d2, [sp] +// CHECK-ENCODING: [0xff,0xc3,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc62c3ff + +//------------------------------------------------------------------------------ +// STBFMAX +//------------------------------------------------------------------------------ + +stbfmax h0, [x2] +// CHECK-INST: stbfmax h0, [x2] +// CHECK-ENCODING: [0x5f,0xc0,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c20c05f + +stbfmax h2, [sp] +// CHECK-INST: stbfmax h2, [sp] +// CHECK-ENCODING: [0xff,0xc3,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c22c3ff + +// -- stbfmaxl + +stbfmaxl h0, [x2] +// CHECK-INST: stbfmaxl h0, [x2] +// CHECK-ENCODING: [0x5f,0xc0,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c60c05f + +stbfmaxl h2, [sp] +// CHECK-INST: stbfmaxl h2, [sp] +// CHECK-ENCODING: [0xff,0xc3,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c62c3ff \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfmaxnm-diagnostics.s b/llvm/test/MC/AArch64/LSFE/stfmaxnm-diagnostics.s new file mode 100644 index 000000000000..db9ff49b1661 --- /dev/null +++ 
b/llvm/test/MC/AArch64/LSFE/stfmaxnm-diagnostics.s @@ -0,0 +1,73 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// STFMAXNM +//------------------------------------------------------------------------------ + +stfmaxnm h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmaxnm h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfmaxnm s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmaxnm s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- stfmaxnml + +stfmaxnml h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmaxnml h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfmaxnml s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmaxnml s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// STBFMAXNM +//------------------------------------------------------------------------------ + +stbfmaxnm s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxnm s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfmaxnm d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxnm d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmaxnm h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxnm h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmaxnm h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxnm h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +// --stbfmaxnml + +stbfmaxnml s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxnml s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfmaxnml d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxnml d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmaxnml h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxnml h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmaxnml h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmaxnml h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfmaxnm.s b/llvm/test/MC/AArch64/LSFE/stfmaxnm.s new file mode 100644 index 000000000000..3f544fda98c2 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/stfmaxnm.s @@ -0,0 +1,121 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// STFMAXNM +//------------------------------------------------------------------------------ + +stfmaxnm h0, [x2] +// CHECK-INST: stfmaxnm h0, [x2] +// CHECK-ENCODING: [0x5f,0xe0,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c20e05f + +stfmaxnm h2, [sp] +// CHECK-INST: stfmaxnm h2, [sp] +// CHECK-ENCODING: [0xff,0xe3,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c22e3ff + +stfmaxnm s0, [x2] +// CHECK-INST: stfmaxnm s0, [x2] +// CHECK-ENCODING: [0x5f,0xe0,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc20e05f + +stfmaxnm s2, [sp] +// CHECK-INST: stfmaxnm s2, [sp] +// CHECK-ENCODING: [0xff,0xe3,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc22e3ff + +stfmaxnm d0, [x2] +// CHECK-INST: stfmaxnm d0, [x2] +// CHECK-ENCODING: [0x5f,0xe0,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc20e05f + +stfmaxnm d2, [sp] +// CHECK-INST: stfmaxnm d2, [sp] +// CHECK-ENCODING: [0xff,0xe3,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc22e3ff + +// -- stfmaxnml + +stfmaxnml h0, [x2] +// CHECK-INST: stfmaxnml h0, [x2] +// CHECK-ENCODING: [0x5f,0xe0,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c60e05f + +stfmaxnml h2, [sp] +// CHECK-INST: stfmaxnml h2, [sp] +// CHECK-ENCODING: [0xff,0xe3,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c62e3ff + +stfmaxnml s0, [x2] +// CHECK-INST: stfmaxnml s0, [x2] +// CHECK-ENCODING: [0x5f,0xe0,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc60e05f + +stfmaxnml s2, [sp] +// CHECK-INST: stfmaxnml s2, [sp] +// CHECK-ENCODING: [0xff,0xe3,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc62e3ff + +stfmaxnml d0, [x2] +// CHECK-INST: stfmaxnml d0, [x2] +// CHECK-ENCODING: [0x5f,0xe0,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc60e05f + +stfmaxnml d2, [sp] +// CHECK-INST: stfmaxnml d2, [sp] +// CHECK-ENCODING: [0xff,0xe3,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc62e3ff + +//------------------------------------------------------------------------------ +// STBFMAXNM +//------------------------------------------------------------------------------ + +stbfmaxnm h0, [x2] +// CHECK-INST: stbfmaxnm h0, [x2] +// CHECK-ENCODING: [0x5f,0xe0,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c20e05f + +stbfmaxnm h2, [sp] +// CHECK-INST: stbfmaxnm h2, [sp] +// CHECK-ENCODING: [0xff,0xe3,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c22e3ff + +// -- stbfmaxnml + +stbfmaxnml h0, [x2] +// CHECK-INST: stbfmaxnml h0, [x2] +// CHECK-ENCODING: [0x5f,0xe0,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c60e05f + +stbfmaxnml h2, [sp] +// CHECK-INST: stbfmaxnml h2, [sp] +// CHECK-ENCODING: [0xff,0xe3,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c62e3ff diff --git a/llvm/test/MC/AArch64/LSFE/stfmin-diagnostics.s b/llvm/test/MC/AArch64/LSFE/stfmin-diagnostics.s new file mode 100644 index 
000000000000..111c1b56c181 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/stfmin-diagnostics.s @@ -0,0 +1,73 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// STFMIN +//------------------------------------------------------------------------------ + +stfmin h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmin h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfmin s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfmin s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- stfminl + +stfminl h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfminl h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfminl s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfminl s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// STBFMIN +//------------------------------------------------------------------------------ + +stbfmin s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmin s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfmin d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmin d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmin h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmin h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfmin h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfmin h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +// -- stbfminl + +stbfminl s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminl s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfminl d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminl d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfminl h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminl h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfminl h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminl h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfmin.s b/llvm/test/MC/AArch64/LSFE/stfmin.s new file mode 100644 index 000000000000..b94689931c95 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/stfmin.s @@ -0,0 +1,121 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// STFMIN +//------------------------------------------------------------------------------ + +stfmin h0, [x2] +// CHECK-INST: stfmin h0, [x2] +// CHECK-ENCODING: [0x5f,0xd0,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c20d05f + +stfmin h2, [sp] +// CHECK-INST: stfmin h2, [sp] +// CHECK-ENCODING: [0xff,0xd3,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c22d3ff + +stfmin s0, [x2] +// CHECK-INST: stfmin s0, [x2] +// CHECK-ENCODING: [0x5f,0xd0,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc20d05f + +stfmin s2, [sp] +// CHECK-INST: stfmin s2, [sp] +// CHECK-ENCODING: [0xff,0xd3,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc22d3ff + +stfmin d0, [x2] +// CHECK-INST: stfmin d0, [x2] +// CHECK-ENCODING: [0x5f,0xd0,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc20d05f + +stfmin d2, [sp] +// CHECK-INST: stfmin d2, [sp] +// CHECK-ENCODING: [0xff,0xd3,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc22d3ff + +// --stfminl + +stfminl h0, [x2] +// CHECK-INST: stfminl h0, [x2] +// CHECK-ENCODING: [0x5f,0xd0,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c60d05f + +stfminl h2, [sp] +// CHECK-INST: stfminl h2, [sp] +// CHECK-ENCODING: [0xff,0xd3,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c62d3ff + +stfminl s0, [x2] +// CHECK-INST: stfminl s0, [x2] +// CHECK-ENCODING: [0x5f,0xd0,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc60d05f + +stfminl s2, [sp] +// CHECK-INST: stfminl s2, [sp] +// CHECK-ENCODING: [0xff,0xd3,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc62d3ff + +stfminl d0, [x2] +// CHECK-INST: stfminl d0, [x2] +// CHECK-ENCODING: [0x5f,0xd0,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc60d05f + +stfminl d2, [sp] +// CHECK-INST: stfminl d2, [sp] +// CHECK-ENCODING: [0xff,0xd3,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc62d3ff + +//------------------------------------------------------------------------------ +// STBFMIN +//------------------------------------------------------------------------------ + +stbfmin h0, [x2] +// CHECK-INST: stbfmin h0, [x2] +// CHECK-ENCODING: [0x5f,0xd0,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c20d05f + +stbfmin h2, [sp] +// CHECK-INST: stbfmin h2, [sp] +// CHECK-ENCODING: [0xff,0xd3,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c22d3ff + +// -- stbfminl + +stbfminl h0, [x2] +// CHECK-INST: stbfminl h0, [x2] +// CHECK-ENCODING: [0x5f,0xd0,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c60d05f + +stbfminl h2, [sp] +// CHECK-INST: stbfminl h2, [sp] +// CHECK-ENCODING: [0xff,0xd3,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c62d3ff \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfminnm-diagnostics.s b/llvm/test/MC/AArch64/LSFE/stfminnm-diagnostics.s new file mode 100644 index 000000000000..cfaae50b17ad --- /dev/null +++ 
b/llvm/test/MC/AArch64/LSFE/stfminnm-diagnostics.s @@ -0,0 +1,73 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe 2>&1 < %s| FileCheck %s + +//------------------------------------------------------------------------------ +// STFMINNM +//------------------------------------------------------------------------------ + +stfminnm h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfminnm h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfminnm s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfminnm s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// -- stfminnml + +stfminnml h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfminnml h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stfminnml s0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stfminnml s0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +//------------------------------------------------------------------------------ +// STBFMINNM +//------------------------------------------------------------------------------ + +stbfminnm s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminnm s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfminnm d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminnm d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfminnm h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminnm h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfminnm h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminnm h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +// -- stbfminnml + +stbfminnml s0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminnml s0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stbfminnml d0, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminnml d0, [x2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfminnml h0, [w2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminnml h0, [w2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +stbfminnml h0, [x2, #4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stbfminnml h0, [x2, #4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} \ No newline at end of file diff --git a/llvm/test/MC/AArch64/LSFE/stfminnm.s b/llvm/test/MC/AArch64/LSFE/stfminnm.s new file mode 100644 index 000000000000..20c429cca588 --- /dev/null +++ b/llvm/test/MC/AArch64/LSFE/stfminnm.s @@ -0,0 +1,121 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=+lsfe - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsfe < %s \ +// RUN: | llvm-objdump -d --mattr=-lsfe - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsfe < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+lsfe -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// STFMINNM +//------------------------------------------------------------------------------ + +stfminnm h0, [x2] +// CHECK-INST: stfminnm h0, [x2] +// CHECK-ENCODING: [0x5f,0xf0,0x20,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c20f05f + +stfminnm h2, [sp] +// CHECK-INST: stfminnm h2, [sp] +// CHECK-ENCODING: [0xff,0xf3,0x22,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c22f3ff + +stfminnm s0, [x2] +// CHECK-INST: stfminnm s0, [x2] +// CHECK-ENCODING: [0x5f,0xf0,0x20,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc20f05f + +stfminnm s2, [sp] +// CHECK-INST: stfminnm s2, [sp] +// CHECK-ENCODING: [0xff,0xf3,0x22,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc22f3ff + +stfminnm d0, [x2] +// CHECK-INST: stfminnm d0, [x2] +// CHECK-ENCODING: [0x5f,0xf0,0x20,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc20f05f + +stfminnm d2, [sp] +// CHECK-INST: stfminnm d2, [sp] +// CHECK-ENCODING: [0xff,0xf3,0x22,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc22f3ff + +// -- stfminnml + +stfminnml h0, [x2] +// CHECK-INST: stfminnml h0, [x2] +// CHECK-ENCODING: [0x5f,0xf0,0x60,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c60f05f + +stfminnml h2, [sp] +// CHECK-INST: stfminnml h2, [sp] +// CHECK-ENCODING: [0xff,0xf3,0x62,0x7c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 7c62f3ff + +stfminnml s0, [x2] +// CHECK-INST: stfminnml s0, [x2] +// CHECK-ENCODING: [0x5f,0xf0,0x60,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc60f05f + +stfminnml s2, [sp] +// CHECK-INST: stfminnml s2, [sp] +// CHECK-ENCODING: [0xff,0xf3,0x62,0xbc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: bc62f3ff + +stfminnml d0, [x2] +// CHECK-INST: stfminnml d0, [x2] +// CHECK-ENCODING: [0x5f,0xf0,0x60,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc60f05f + +stfminnml d2, [sp] +// CHECK-INST: stfminnml d2, [sp] +// CHECK-ENCODING: [0xff,0xf3,0x62,0xfc] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: fc62f3ff + +//------------------------------------------------------------------------------ +// STBFMINNM +//------------------------------------------------------------------------------ + +stbfminnm h0, [x2] +// CHECK-INST: stbfminnm h0, [x2] +// CHECK-ENCODING: [0x5f,0xf0,0x20,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c20f05f + +stbfminnm h2, [sp] +// CHECK-INST: stbfminnm h2, [sp] +// CHECK-ENCODING: [0xff,0xf3,0x22,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c22f3ff + +// -- stbfminnml + +stbfminnml h0, [x2] +// CHECK-INST: stbfminnml h0, [x2] +// CHECK-ENCODING: [0x5f,0xf0,0x60,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c60f05f + +stbfminnml h2, [sp] +// CHECK-INST: stbfminnml h2, [sp] +// CHECK-ENCODING: [0xff,0xf3,0x62,0x3c] +// CHECK-ERROR: instruction requires: lsfe +// CHECK-UNKNOWN: 3c62f3ff \ No newline at end of file -- GitLab From a2ba438f3e5635e368333213914c7452a6a6a2da Mon Sep 17 00:00:00 2001 From: XChy Date: Mon, 21 Oct 2024 22:30:31 +0800 
Subject: [PATCH 242/511] [InstCombine] Preserve the flag from RHS only if the `and` is bitwise (#113164)

Fixes #113123
Alive proof: https://alive2.llvm.org/ce/z/hnqeLC
---
 .../InstCombine/InstCombineAndOrXor.cpp       |  6 ++-
 .../InstCombine/fcmp-range-check-idiom.ll     | 39 +++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index d72013ba223d..835eae9f5725 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1531,8 +1531,10 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
     }
     if (IsLessThanOrLessEqual(IsAnd ? PredL : PredR)) {
       BuilderTy::FastMathFlagGuard Guard(Builder);
-      Builder.setFastMathFlags(LHS->getFastMathFlags() |
-                               RHS->getFastMathFlags());
+      FastMathFlags NewFlag = LHS->getFastMathFlags();
+      if (!IsLogicalSelect)
+        NewFlag |= RHS->getFastMathFlags();
+      Builder.setFastMathFlags(NewFlag);
 
       Value *FAbs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, LHS0);
       return Builder.CreateFCmp(PredL, FAbs,
diff --git a/llvm/test/Transforms/InstCombine/fcmp-range-check-idiom.ll b/llvm/test/Transforms/InstCombine/fcmp-range-check-idiom.ll
index 10a3ccf3cdb4..54dbb09cb8fd 100644
--- a/llvm/test/Transforms/InstCombine/fcmp-range-check-idiom.ll
+++ b/llvm/test/Transforms/InstCombine/fcmp-range-check-idiom.ll
@@ -359,3 +359,42 @@ define i1 @test_and_olt_fmf_propagation_union(float %x) {
   %cond = and i1 %cmp1, %cmp2
   ret i1 %cond
 }
+
+define i1 @test_and_olt_fmf_propagation_union_logical_rhs_poison(float %x) {
+; CHECK-LABEL: define i1 @test_and_olt_fmf_propagation_union_logical_rhs_poison(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[COND:%.*]] = fcmp olt float [[TMP1]], 0x3C00000000000000
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %cmp1 = fcmp ninf olt float %x, 0x3C00000000000000
+  %cmp2 = fcmp ogt float %x, 0xBC00000000000000
+  %cond = select i1 %cmp2, i1 %cmp1, i1 false
+  ret i1 %cond
+}
+
+define i1 @test_and_olt_fmf_propagation_union_logical_lhs_poison(float %x) {
+; CHECK-LABEL: define i1 @test_and_olt_fmf_propagation_union_logical_lhs_poison(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ninf float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[COND:%.*]] = fcmp ninf olt float [[TMP1]], 0x3C00000000000000
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %cmp1 = fcmp olt float %x, 0x3C00000000000000
+  %cmp2 = fcmp ninf ogt float %x, 0xBC00000000000000
+  %cond = select i1 %cmp2, i1 %cmp1, i1 false
+  ret i1 %cond
+}
+
+define i1 @test_and_olt_fmf_propagation_union_logical_both_poison(float %x) {
+; CHECK-LABEL: define i1 @test_and_olt_fmf_propagation_union_logical_both_poison(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ninf float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[COND:%.*]] = fcmp ninf olt float [[TMP1]], 0x3C00000000000000
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %cmp1 = fcmp ninf olt float %x, 0x3C00000000000000
+  %cmp2 = fcmp ninf ogt float %x, 0xBC00000000000000
+  %cond = select i1 %cmp2, i1 %cmp1, i1 false
+  ret i1 %cond
+}
-- 
GitLab


From 1e07c4800cf46ac9e4748b664cb58cbd48acb918 Mon Sep 17 00:00:00 2001
From: Chris Apple
Date: Mon, 21 Oct 2024 07:48:56 -0700
Subject: [PATCH 243/511] [rtsan][NFC] Rename *interceptors.cpp to *interceptors_posix.cpp (#112935)

Done in preparation for exploring rtsan on Windows.
--- compiler-rt/lib/rtsan/CMakeLists.txt | 2 +- .../{rtsan_interceptors.cpp => rtsan_interceptors_posix.cpp} | 0 compiler-rt/lib/rtsan/tests/CMakeLists.txt | 2 +- ..._test_interceptors.cpp => rtsan_test_interceptors_posix.cpp} | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename compiler-rt/lib/rtsan/{rtsan_interceptors.cpp => rtsan_interceptors_posix.cpp} (100%) rename compiler-rt/lib/rtsan/tests/{rtsan_test_interceptors.cpp => rtsan_test_interceptors_posix.cpp} (100%) diff --git a/compiler-rt/lib/rtsan/CMakeLists.txt b/compiler-rt/lib/rtsan/CMakeLists.txt index f8dd4d735bc2..a4413d9992b6 100644 --- a/compiler-rt/lib/rtsan/CMakeLists.txt +++ b/compiler-rt/lib/rtsan/CMakeLists.txt @@ -5,7 +5,7 @@ set(RTSAN_CXX_SOURCES rtsan_context.cpp rtsan_diagnostics.cpp rtsan_flags.cpp - rtsan_interceptors.cpp + rtsan_interceptors_posix.cpp rtsan_stats.cpp rtsan_suppressions.cpp ) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp similarity index 100% rename from compiler-rt/lib/rtsan/rtsan_interceptors.cpp rename to compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp diff --git a/compiler-rt/lib/rtsan/tests/CMakeLists.txt b/compiler-rt/lib/rtsan/tests/CMakeLists.txt index 139eea785fcd..0cf07b307d46 100644 --- a/compiler-rt/lib/rtsan/tests/CMakeLists.txt +++ b/compiler-rt/lib/rtsan/tests/CMakeLists.txt @@ -16,7 +16,7 @@ set(RTSAN_UNITTEST_CFLAGS set(RTSAN_INST_TEST_SOURCES rtsan_test_functional.cpp - rtsan_test_interceptors.cpp + rtsan_test_interceptors_posix.cpp rtsan_test_main.cpp) set(RTSAN_NOINST_TEST_SOURCES diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp similarity index 100% rename from compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp rename to compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp -- GitLab From 4679583181a9032b4f7c6476c7a1bfefe5724b47 Mon Sep 17 00:00:00 2001 From: SpencerAbson Date: Mon, 21 Oct 2024 15:49:24 +0100 Subject: [PATCH 244/511] [LLVM][AArch64] Add register classes for Armv9.6 assembly (#111717) Add new register classes/operands and their encoder/decoder behaviour required for the new Armv9.6 instructions (see https://developer.arm.com/documentation/109697/2024_09/Feature-descriptions/The-Armv9-6-architecture-extension). This work is the basis of the 2024 Armv9.6 architecture update effort for SME.
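[Editor's note] A minimal illustration of the encoder behaviour described above, not code from this patch: the scheme stores a range- and step-constrained register number as a small field value, and the patch's own EncodeRegMul_MinMax (later in this diff) computes exactly (RegVal - Min) / Multiple. The standalone helper below is a hypothetical C++ sketch of that arithmetic; its name and free-function form are assumptions made for illustration, not the patch's actual API.

#include <cassert>
#include <cstdint>

// Sketch only: a register index constrained to [Min, Max] in steps of
// `Multiple` is compressed to the field value (Reg - Min) / Multiple.
static uint32_t encodeRegMulMinMax(unsigned Reg, unsigned Multiple,
                                   unsigned Min, unsigned Max) {
  assert(Reg >= Min && Reg <= Max && "register index out of range");
  assert((Reg - Min) % Multiple == 0 && "register index not on a legal step");
  return (Reg - Min) / Multiple;
}

Under this sketch, an even register such as z20 taken from the "Hi" half of the Z file (Min = 16, Max = 30, Multiple = 2) would be stored as field value (20 - 16) / 2 = 2.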
Co-authored-by: Caroline Concatto caroline.concatto@arm.com Co-authored-by: Marian Lukac marian.lukac@arm.com Co-authored-by: Momchil Velikov momchil.velikov@arm.com --- .../lib/Target/AArch64/AArch64RegisterInfo.td | 149 ++++++++++++++---- .../AArch64/AsmParser/AArch64AsmParser.cpp | 108 ++++++++++--- .../Disassembler/AArch64Disassembler.cpp | 43 ++++- .../MCTargetDesc/AArch64MCCodeEmitter.cpp | 37 +++-- .../AArch64/GlobalISel/regbank-inlineasm.mir | 8 +- llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll | 14 +- .../emit_fneg_with_non_register_operand.mir | 8 +- llvm/test/CodeGen/AArch64/fmlal-loreg.ll | 2 +- .../CodeGen/AArch64/peephole-insvigpr.mir | 4 +- 9 files changed, 294 insertions(+), 79 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index f754c32e1176..8516ab2c7dd7 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -923,10 +923,9 @@ class ZPRRegOp : RegisterClass< - "AArch64", +class PPRClass : RegisterClass<"AArch64", [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16, - (sequence "P%u", firstreg, lastreg)> { + (sequence "P%u", firstreg, lastreg, step)> { let Size = 16; } @@ -940,6 +939,8 @@ def PPR_p8to15 : PPRClass<8, 15> { let DecoderMethod = "DecodeSimpleRegisterClass"; } +def PPRMul2 : PPRClass<0, 14, 2>; + class PPRAsmOperand : AsmOperandClass { let Name = "SVE" # name # "Reg"; let PredicateMethod = "isSVEPredicateVectorRegOfWidth<" @@ -1098,10 +1099,11 @@ class PPRVectorListMul : PPRVectorList"; + # ElementWidth # + ", AArch64::PPRMul2RegClassID>"; } -let EncoderMethod = "EncodeRegAsMultipleOf<2>", +let EncoderMethod = "EncodeRegMul_MinMax<2, 0, 14>", DecoderMethod = "DecodePPR2Mul2RegisterClass" in { def PP_b_mul_r : RegisterOperand"> { let ParserMatchClass = PPRVectorListMul<8, 2>; @@ -1124,23 +1126,28 @@ let EncoderMethod = "EncodeRegAsMultipleOf<2>", //****************************************************************************** // SVE vector register classes -class ZPRClass : RegisterClass<"AArch64", +class ZPRClass : RegisterClass<"AArch64", [nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], - 128, (sequence "Z%u", 0, lastreg)> { + 128, (sequence "Z%u", firstreg, lastreg, step)> { let Size = 128; } -def ZPR : ZPRClass<31> { +def ZPRMul2 : ZPRClass<0, 30, 2>; +def ZPRMul4 : ZPRClass<0, 28, 4>; +def ZPRMul2_Lo : ZPRClass<0, 14, 2>; +def ZPRMul2_Hi : ZPRClass<16, 30, 2>; + +def ZPR : ZPRClass<0, 31> { let DecoderMethod = "DecodeSimpleRegisterClass"; } -def ZPR_4b : ZPRClass<15> { // Restricted 4 bit SVE vector register class. +def ZPR_4b : ZPRClass<0, 15> { // Restricted 4 bit SVE vector register class. let DecoderMethod = "DecodeSimpleRegisterClass"; } -def ZPR_3b : ZPRClass<7> { // Restricted 3 bit SVE vector register class. +def ZPR_3b : ZPRClass<0, 7> { // Restricted 3 bit SVE vector register class. 
let DecoderMethod = "DecodeSimpleRegisterClass"; } @@ -1188,6 +1195,39 @@ def ZPR4b16 : ZPRRegOp<"h", ZPRAsmOp4b16, ElementSizeH, ZPR_4b>; def ZPR4b32 : ZPRRegOp<"s", ZPRAsmOp4b32, ElementSizeS, ZPR_4b>; def ZPR4b64 : ZPRRegOp<"d", ZPRAsmOp4b64, ElementSizeD, ZPR_4b>; +class ZPRMul2_MinToMaxRegOp + : ZPRRegOp { + let EncoderMethod = "EncodeRegMul_MinMax<2," # Min # ", " # Max # ">"; + let DecoderMethod = "DecodeZPRMul2_MinMax<" # Min # ", " # Max # ">"; +} + +def ZPRMul2AsmOp8_Lo : ZPRAsmOperand<"VectorB_Lo", 8, "Mul2_Lo">; +def ZPRMul2AsmOp8_Hi : ZPRAsmOperand<"VectorB_Hi", 8, "Mul2_Hi">; +def ZPRMul2AsmOp16_Lo : ZPRAsmOperand<"VectorH_Lo", 16, "Mul2_Lo">; +def ZPRMul2AsmOp16_Hi : ZPRAsmOperand<"VectorH_Hi", 16, "Mul2_Hi">; +def ZPRMul2AsmOp32_Lo : ZPRAsmOperand<"VectorS_Lo", 32, "Mul2_Lo">; +def ZPRMul2AsmOp32_Hi : ZPRAsmOperand<"VectorS_Hi", 32, "Mul2_Hi">; +def ZPRMul2AsmOp64_Lo : ZPRAsmOperand<"VectorD_Lo", 64, "Mul2_Lo">; +def ZPRMul2AsmOp64_Hi : ZPRAsmOperand<"VectorD_Hi", 64, "Mul2_Hi">; + +def ZPR_K : RegisterClass<"AArch64", [untyped], 128, + (add Z20, Z21, Z22, Z23, Z28, Z29, Z30, Z31)>; + +def ZK : RegisterOperand">{ + let EncoderMethod = "EncodeZK"; + let DecoderMethod = "DecodeZK"; + let ParserMatchClass = ZPRAsmOperand<"Vector_20to23or28to31", 0, "_K">; +} + +def ZPR8Mul2_Lo : ZPRMul2_MinToMaxRegOp<"b", ZPRMul2AsmOp8_Lo, 0, 14, ElementSizeB, ZPRMul2_Lo>; +def ZPR8Mul2_Hi : ZPRMul2_MinToMaxRegOp<"b", ZPRMul2AsmOp8_Hi, 16, 30, ElementSizeB, ZPRMul2_Hi>; +def ZPR16Mul2_Lo : ZPRMul2_MinToMaxRegOp<"h", ZPRMul2AsmOp16_Lo, 0, 14, ElementSizeH, ZPRMul2_Lo>; +def ZPR16Mul2_Hi : ZPRMul2_MinToMaxRegOp<"h", ZPRMul2AsmOp16_Hi, 16, 30, ElementSizeH, ZPRMul2_Hi>; +def ZPR32Mul2_Lo : ZPRMul2_MinToMaxRegOp<"s", ZPRMul2AsmOp32_Lo, 0, 14, ElementSizeS, ZPRMul2_Lo>; +def ZPR32Mul2_Hi : ZPRMul2_MinToMaxRegOp<"s", ZPRMul2AsmOp32_Hi, 16, 30, ElementSizeS, ZPRMul2_Hi>; +def ZPR64Mul2_Lo : ZPRMul2_MinToMaxRegOp<"d", ZPRMul2AsmOp64_Lo, 0, 14, ElementSizeD, ZPRMul2_Lo>; +def ZPR64Mul2_Hi : ZPRMul2_MinToMaxRegOp<"d", ZPRMul2AsmOp64_Hi, 16, 30, ElementSizeD, ZPRMul2_Hi>; + class FPRasZPR : AsmOperandClass{ let Name = "FPR" # Width # "asZPR"; let PredicateMethod = "isFPRasZPR"; @@ -1327,64 +1367,117 @@ def ZPR4Mul4 : RegisterClass<"AArch64", [untyped], 128, (add (decimate ZSeqQuads let Size = 512; } -class ZPRVectorListMul : ZPRVectorList { - let Name = "SVEVectorListMul" # NumRegs # "x" # ElementWidth; +class ZPRVectorListMul + : ZPRVectorList { + let Name = "SVEVectorList" # NumRegs # "x" # ElementWidth # RegClassSuffix; let DiagnosticType = "Invalid" # Name; let PredicateMethod = - "isTypedVectorListMultiple"; + "isTypedVectorListMultiple"; } -let EncoderMethod = "EncodeRegAsMultipleOf<2>", - DecoderMethod = "DecodeZPR2Mul2RegisterClass" in { +let EncoderMethod = "EncodeRegMul_MinMax<2, 0, 30>", + DecoderMethod = "DecodeZPR2Mul2RegisterClass<0, 30>" in { def ZZ_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<0, 2>; + let ParserMatchClass = ZPRVectorListMul<0, 2, "Mul2">; } def ZZ_b_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<8, 2>; + let ParserMatchClass = ZPRVectorListMul<8, 2, "Mul2">; } def ZZ_h_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<16, 2>; + let ParserMatchClass = ZPRVectorListMul<16, 2, "Mul2">; } def ZZ_s_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<32, 2>; + let ParserMatchClass = ZPRVectorListMul<32, 2, "Mul2">; } def ZZ_d_mul_r : RegisterOperand"> { - let ParserMatchClass = 
ZPRVectorListMul<64, 2>; + let ParserMatchClass = ZPRVectorListMul<64, 2, "Mul2">; } def ZZ_q_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<128, 2>; + let ParserMatchClass = ZPRVectorListMul<128, 2, "Mul2">; } } // end let EncoderMethod/DecoderMethod -let EncoderMethod = "EncodeRegAsMultipleOf<4>", +let EncoderMethod = "EncodeRegMul_MinMax<4, 0, 28>", DecoderMethod = "DecodeZPR4Mul4RegisterClass" in { def ZZZZ_b_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<8, 4>; + let ParserMatchClass = ZPRVectorListMul<8, 4, "Mul4">; } def ZZZZ_h_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<16, 4>; + let ParserMatchClass = ZPRVectorListMul<16, 4, "Mul4">; } def ZZZZ_s_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<32, 4>; + let ParserMatchClass = ZPRVectorListMul<32, 4, "Mul4">; } def ZZZZ_d_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<64, 4>; + let ParserMatchClass = ZPRVectorListMul<64, 4, "Mul4">; } def ZZZZ_q_mul_r : RegisterOperand"> { - let ParserMatchClass = ZPRVectorListMul<128, 4>; + let ParserMatchClass = ZPRVectorListMul<128, 4, "Mul4">; } } // end let EncoderMethod/DecoderMethod +// Pairs of consecutive ZPR, starting with an even register, split into +// Lo=0-14 and Hi=16-30. +def ZPR2Mul2_Lo : RegisterClass<"AArch64", [untyped], 128, + (trunc (decimate ZSeqPairs, 2), 8)> { + let Size = 256; +} + +def ZPR2Mul2_Hi : RegisterClass<"AArch64", [untyped], 128, + (trunc (rotr (decimate ZSeqPairs, 2), 8), 8)> { + let Size = 256; +} + +let EncoderMethod = "EncodeRegMul_MinMax<2, 0, 14>", + DecoderMethod = "DecodeZPR2Mul2RegisterClass<0, 16>" in { + def ZZ_b_mul_r_Lo : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<8, 2, "Mul2_Lo">; + } + + def ZZ_h_mul_r_Lo : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<16, 2, "Mul2_Lo">; + } + + def ZZ_s_mul_r_Lo : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<32, 2, "Mul2_Lo">; + } + + def ZZ_d_mul_r_Lo : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<64, 2, "Mul2_Lo">; + } +} + +let EncoderMethod = "EncodeRegMul_MinMax<2, 16, 30>", + DecoderMethod = "DecodeZPR2Mul2RegisterClass<16, 31>" in { + def ZZ_b_mul_r_Hi : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<8, 2, "Mul2_Hi">; + } + + def ZZ_h_mul_r_Hi : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<16, 2, "Mul2_Hi">; + } + + def ZZ_s_mul_r_Hi : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<32, 2, "Mul2_Hi">; + } + + def ZZ_d_mul_r_Hi : RegisterOperand"> { + let ParserMatchClass = ZPRVectorListMul<64, 2, "Mul2_Hi">; + } + } // end let EncoderMethod/DecoderMethod + // SME2 strided multi-vector operands // ZStridedPairs diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index a5165d45893f..d0d2fda23a58 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1262,6 +1262,9 @@ public: case AArch64::ZPRRegClassID: case AArch64::ZPR_3bRegClassID: case AArch64::ZPR_4bRegClassID: + case AArch64::ZPRMul2_LoRegClassID: + case AArch64::ZPRMul2_HiRegClassID: + case AArch64::ZPR_KRegClassID: RK = RegKind::SVEDataVector; break; case AArch64::PPRRegClassID: @@ -1442,13 +1445,13 @@ public: } template + unsigned ElementWidth, unsigned RegClass> DiagnosticPredicate isTypedVectorListMultiple() const { bool Res = isTypedVectorList(); if (!Res) return 
DiagnosticPredicateTy::NoMatch; - if (((VectorList.RegNum - AArch64::Z0) % NumRegs) != 0) + if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.RegNum)) return DiagnosticPredicateTy::NearMatch; return DiagnosticPredicateTy::Match; } @@ -6092,6 +6095,33 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, return Error(Loc, "Invalid restricted vector register, expected z0.s..z15.s"); case Match_InvalidZPR_4b64: return Error(Loc, "Invalid restricted vector register, expected z0.d..z15.d"); + case Match_InvalidZPRMul2_Lo8: + return Error(Loc, "Invalid restricted vector register, expected even " + "register in z0.b..z14.b"); + case Match_InvalidZPRMul2_Hi8: + return Error(Loc, "Invalid restricted vector register, expected even " + "register in z16.b..z30.b"); + case Match_InvalidZPRMul2_Lo16: + return Error(Loc, "Invalid restricted vector register, expected even " + "register in z0.h..z14.h"); + case Match_InvalidZPRMul2_Hi16: + return Error(Loc, "Invalid restricted vector register, expected even " + "register in z16.h..z30.h"); + case Match_InvalidZPRMul2_Lo32: + return Error(Loc, "Invalid restricted vector register, expected even " + "register in z0.s..z14.s"); + case Match_InvalidZPRMul2_Hi32: + return Error(Loc, "Invalid restricted vector register, expected even " + "register in z16.s..z30.s"); + case Match_InvalidZPRMul2_Lo64: + return Error(Loc, "Invalid restricted vector register, expected even " + "register in z0.d..z14.d"); + case Match_InvalidZPRMul2_Hi64: + return Error(Loc, "Invalid restricted vector register, expected even " + "register in z16.d..z30.d"); + case Match_InvalidZPR_K0: + return Error(Loc, "invalid restricted vector register, expected register " + "in z20..z23 or z28..z31"); case Match_InvalidSVEPattern: return Error(Loc, "invalid predicate pattern"); case Match_InvalidSVEPPRorPNRAnyReg: @@ -6171,19 +6201,36 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, return Error(Loc, "operand must be a register in range [w12, w15]"); case Match_InvalidMatrixIndexGPR32_8_11: return Error(Loc, "operand must be a register in range [w8, w11]"); - case Match_InvalidSVEVectorListMul2x8: - case Match_InvalidSVEVectorListMul2x16: - case Match_InvalidSVEVectorListMul2x32: - case Match_InvalidSVEVectorListMul2x64: - case Match_InvalidSVEVectorListMul2x128: + case Match_InvalidSVEVectorList2x8Mul2: + case Match_InvalidSVEVectorList2x16Mul2: + case Match_InvalidSVEVectorList2x32Mul2: + case Match_InvalidSVEVectorList2x64Mul2: + case Match_InvalidSVEVectorList2x128Mul2: return Error(Loc, "Invalid vector list, expected list with 2 consecutive " "SVE vectors, where the first vector is a multiple of 2 " "and with matching element types"); - case Match_InvalidSVEVectorListMul4x8: - case Match_InvalidSVEVectorListMul4x16: - case Match_InvalidSVEVectorListMul4x32: - case Match_InvalidSVEVectorListMul4x64: - case Match_InvalidSVEVectorListMul4x128: + case Match_InvalidSVEVectorList2x8Mul2_Lo: + case Match_InvalidSVEVectorList2x16Mul2_Lo: + case Match_InvalidSVEVectorList2x32Mul2_Lo: + case Match_InvalidSVEVectorList2x64Mul2_Lo: + return Error(Loc, "Invalid vector list, expected list with 2 consecutive " + "SVE vectors in the range z0-z14, where the first vector " + "is a multiple of 2 " + "and with matching element types"); + case Match_InvalidSVEVectorList2x8Mul2_Hi: + case Match_InvalidSVEVectorList2x16Mul2_Hi: + case Match_InvalidSVEVectorList2x32Mul2_Hi: + case Match_InvalidSVEVectorList2x64Mul2_Hi: + return Error(Loc, + "Invalid vector list, 
expected list with 2 consecutive " + "SVE vectors in the range z16-z30, where the first vector " + "is a multiple of 2 " + "and with matching element types"); + case Match_InvalidSVEVectorList4x8Mul4: + case Match_InvalidSVEVectorList4x16Mul4: + case Match_InvalidSVEVectorList4x32Mul4: + case Match_InvalidSVEVectorList4x64Mul4: + case Match_InvalidSVEVectorList4x128Mul4: return Error(Loc, "Invalid vector list, expected list with 4 consecutive " "SVE vectors, where the first vector is a multiple of 4 " "and with matching element types"); @@ -6776,16 +6823,33 @@ bool AArch64AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidMatrixIndexGPR32_12_15: case Match_InvalidMatrixIndexGPR32_8_11: case Match_InvalidLookupTable: - case Match_InvalidSVEVectorListMul2x8: - case Match_InvalidSVEVectorListMul2x16: - case Match_InvalidSVEVectorListMul2x32: - case Match_InvalidSVEVectorListMul2x64: - case Match_InvalidSVEVectorListMul2x128: - case Match_InvalidSVEVectorListMul4x8: - case Match_InvalidSVEVectorListMul4x16: - case Match_InvalidSVEVectorListMul4x32: - case Match_InvalidSVEVectorListMul4x64: - case Match_InvalidSVEVectorListMul4x128: + case Match_InvalidZPRMul2_Lo8: + case Match_InvalidZPRMul2_Hi8: + case Match_InvalidZPRMul2_Lo16: + case Match_InvalidZPRMul2_Hi16: + case Match_InvalidZPRMul2_Lo32: + case Match_InvalidZPRMul2_Hi32: + case Match_InvalidZPRMul2_Lo64: + case Match_InvalidZPRMul2_Hi64: + case Match_InvalidZPR_K0: + case Match_InvalidSVEVectorList2x8Mul2: + case Match_InvalidSVEVectorList2x16Mul2: + case Match_InvalidSVEVectorList2x32Mul2: + case Match_InvalidSVEVectorList2x64Mul2: + case Match_InvalidSVEVectorList2x128Mul2: + case Match_InvalidSVEVectorList4x8Mul4: + case Match_InvalidSVEVectorList4x16Mul4: + case Match_InvalidSVEVectorList4x32Mul4: + case Match_InvalidSVEVectorList4x64Mul4: + case Match_InvalidSVEVectorList4x128Mul4: + case Match_InvalidSVEVectorList2x8Mul2_Lo: + case Match_InvalidSVEVectorList2x16Mul2_Lo: + case Match_InvalidSVEVectorList2x32Mul2_Lo: + case Match_InvalidSVEVectorList2x64Mul2_Lo: + case Match_InvalidSVEVectorList2x8Mul2_Hi: + case Match_InvalidSVEVectorList2x16Mul2_Hi: + case Match_InvalidSVEVectorList2x32Mul2_Hi: + case Match_InvalidSVEVectorList2x64Mul2_Hi: case Match_InvalidSVEVectorListStrided2x8: case Match_InvalidSVEVectorListStrided2x16: case Match_InvalidSVEVectorListStrided2x32: diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index b97f00c99311..4a4b89da7188 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -45,6 +45,13 @@ static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); +template +static DecodeStatus DecodeZPRMul2_MinMax(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeZK(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); +template static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); @@ -355,13 +362,45 @@ DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, return Success; } +template +static DecodeStatus DecodeZPRMul2_MinMax(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const 
MCDisassembler *Decoder) { + unsigned Reg = (RegNo * 2) + Min; + if (Reg < Min || Reg > Max || (Reg & 1)) + return Fail; + unsigned Register = + AArch64MCRegisterClasses[AArch64::ZPRRegClassID].getRegister(Reg); + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +template static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { - if (RegNo * 2 > 30) + unsigned Reg = (RegNo * 2) + Min; + if (Reg < Min || Reg > Max || (Reg & 1)) + return Fail; + + unsigned Register = + AArch64MCRegisterClasses[AArch64::ZPR2RegClassID].getRegister(Reg); + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +// Zk Is the name of the control vector register Z20-Z23 or Z28-Z31, encoded in +// the "K:Zk" fields. Z20-Z23 = 000, 001,010, 011 and Z28-Z31 = 100, 101, 110, +// 111 +static DecodeStatus DecodeZK(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { + // RegNo < 4 => Reg is in Z20-Z23 (offset 20) + // RegNo >= 4 => Reg is in Z28-Z31 (offset 24) + unsigned Reg = (RegNo < 4) ? (RegNo + 20) : (RegNo + 24); + if (!(Reg >= 20 && Reg <= 23) && !(Reg >= 28 && Reg <= 31)) return Fail; unsigned Register = - AArch64MCRegisterClasses[AArch64::ZPR2RegClassID].getRegister(RegNo * 2); + AArch64MCRegisterClasses[AArch64::ZPRRegClassID].getRegister(Reg); Inst.addOperand(MCOperand::createReg(Register)); return Success; } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index c3e12b6d8024..85ffb8639dad 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -191,10 +191,13 @@ public: unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const; - template - uint32_t EncodeRegAsMultipleOf(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + template + uint32_t EncodeRegMul_MinMax(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t EncodeZK(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; uint32_t EncodePNR_p8to15(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -561,15 +564,31 @@ AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, return MO.getImm() - 8; } -template +template uint32_t -AArch64MCCodeEmitter::EncodeRegAsMultipleOf(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +AArch64MCCodeEmitter::EncodeRegMul_MinMax(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { assert(llvm::isPowerOf2_32(Multiple) && "Multiple is not a power of 2"); auto RegOpnd = MI.getOperand(OpIdx).getReg(); unsigned RegVal = Ctx.getRegisterInfo()->getEncodingValue(RegOpnd); - return RegVal / Multiple; + assert(RegVal >= Min && RegVal <= Max && (RegVal & (Multiple - 1)) == 0); + return (RegVal - Min) / Multiple; +} + +// Zk Is the name of the control vector register Z20-Z23 or Z28-Z31, encoded in +// the "K:Zk" fields. 
Z20-Z23 = 000, 001,010, 011 and Z28-Z31 = 100, 101, 110, +// 111 +uint32_t AArch64MCCodeEmitter::EncodeZK(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + auto RegOpnd = MI.getOperand(OpIdx).getReg(); + unsigned RegVal = Ctx.getRegisterInfo()->getEncodingValue(RegOpnd); + // Z28 => RegVal = 28 (28 - 24 = 4) Z28 = 4 + if (RegOpnd > AArch64::Z27) + return (RegVal - 24); + // Z20 => RegVal = 20 (20 -20 = 0) Z20 = 0 + return (RegVal - 20); } uint32_t diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir index 2ffb78568068..f1d1b691fe1a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir @@ -57,11 +57,11 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_reg_output - ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1769482 /* regdef:GPR32common */, def %0 + ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 2490378 /* regdef:GPR32common */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 - INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1769482 /* regdef:GPR32common */, def %0:gpr32common + INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 2490378 /* regdef:GPR32common */, def %0:gpr32common %1:_(s32) = COPY %0 $w0 = COPY %1(s32) RET_ReallyLR implicit $w0 @@ -75,12 +75,12 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_mixed_types - ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1769482 /* regdef:GPR32common */, def %0, {{[0-9]+}} /* regdef:FPR64 */, def %1 + ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 2490378 /* regdef:GPR32common */, def %0, 3342346 /* regdef:FPR64 */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr(s64) = COPY %1 ; CHECK-NEXT: $d0 = COPY [[COPY1]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 - INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1769482 /* regdef:GPR32common */, def %0:gpr32common, 2621450 /* regdef:FPR64 */, def %1:fpr64 + INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 2490378 /* regdef:GPR32common */, def %0:gpr32common, 3342346 /* regdef:FPR64 */, def %1:fpr64 %3:_(s32) = COPY %0 %4:_(s64) = COPY %1 $d0 = COPY %4(s64) diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll index 068e194779c1..9f8897575b3d 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll @@ -13,7 +13,7 @@ define @test_svadd_i8( %Zn, asm "add $0.b, $1.b, $2.b", "=w,w,y"( %Zn, %Zm) @@ -29,7 +29,7 @@ define @test_svsub_i64( %Zn, asm "sub $0.d, $1.d, $2.d", "=w,w,x"( %Zn, %Zm) @@ -45,7 +45,7 @@ define @test_svfmul_f16( %Zn, asm "fmul $0.h, $1.h, $2.h", "=w,w,y"( %Zn, %Zm) @@ -61,7 +61,7 @@ define @test_svfmul_f( %Zn, asm "fmul $0.s, $1.s, $2.s", "=w,w,x"( %Zn, %Zm) @@ -79,7 +79,7 @@ define @test_svfadd_f16( %Pg, asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Upl,w,w"( %Pg, %Zn, %Zm) @@ -95,7 +95,7 @@ define @test_incp( %Pg, ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ppr = COPY $p0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"incp $0.s, $1", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 393225 /* reguse:PPR */, [[COPY2]], 2147483657 /* reguse 
tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"incp $0.s, $1", 0 /* attdialect */, 5767178 /* regdef:ZPR */, def %2, 458761 /* reguse:PPR */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) ; CHECK-NEXT: $z0 = COPY %2 ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "incp $0.s, $1", "=w,@3Upa,0"( %Pg, %Zn) @@ -113,7 +113,7 @@ define @test_svfadd_f16_Uph_constraint( %P ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr_p8to15 = COPY [[COPY2]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 655369 /* reguse:PPR_p8to15 */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]] + ; CHECK-NEXT: INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5767178 /* regdef:ZPR */, def %3, 786441 /* reguse:PPR_p8to15 */, [[COPY3]], 5767177 /* reguse:ZPR */, [[COPY4]], 5767177 /* reguse:ZPR */, [[COPY5]] ; CHECK-NEXT: $z0 = COPY %3 ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"( %Pg, %Zn, %Zm) diff --git a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir index 2be7aba2a3df..ffa7453e48b4 100644 --- a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir +++ b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir @@ -91,10 +91,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @c ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2621450 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3342346 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY %2 ; CHECK-NEXT: [[LDRDui1:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2621450 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3342346 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) ; CHECK-NEXT: [[FNEGDr:%[0-9]+]]:fpr64 = FNEGDr %2 ; CHECK-NEXT: nofpexcept FCMPDrr %4, killed [[FNEGDr]], implicit-def $nzcv, implicit $fpcr ; CHECK-NEXT: Bcc 1, %bb.2, implicit $nzcv @@ -111,10 +111,10 @@ body: | %6:gpr64common = LOADgot target-flags(aarch64-got) @c %3:fpr64 = LDRDui %6, 0 :: (dereferenceable load (s64) from @c) - INLINEASM &"", 1 /* sideeffect attdialect */, 2621450 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, %3(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 3342346 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, %3(tied-def 3) %0:fpr64 = COPY %2 %5:fpr64 = LDRDui %6, 0 :: (dereferenceable load (s64) from @c) - INLINEASM &"", 1 /* sideeffect attdialect */, 2621450 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, %5(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 3342346 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, %5(tied-def 3) %7:fpr64 = FNEGDr %2 nofpexcept FCMPDrr %4, killed %7, implicit-def 
$nzcv, implicit $fpcr Bcc 1, %bb.2, implicit $nzcv diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll index 20737a731839..31ead890ba8a 100644 --- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll +++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll @@ -11,8 +11,8 @@ define <4 x float> @test(ptr %lhs_panel, ptr %rhs_panel, <4 x float> %a) { ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 ; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: ldr q8, [x0] ; CHECK-NEXT: ldr q16, [x1] +; CHECK-NEXT: ldr q8, [x0] ; CHECK-NEXT: lsr x9, x8, #32 ; CHECK-NEXT: //APP ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir index 5dd29cf39c0e..f8af5b963701 100644 --- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir +++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir @@ -487,7 +487,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr64all = IMPLICIT_DEF ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[DEF]] - ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 2621450 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed [[COPY1]] + ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 3342346 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed [[COPY1]] ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub ; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF @@ -505,7 +505,7 @@ body: | %0:gpr64common = COPY $x0 %2:gpr64all = IMPLICIT_DEF %3:gpr64sp = COPY %2 - INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 2621450 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed %3 + INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 3342346 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed %3 %4:fpr128 = MOVIv2d_ns 0 %5:fpr64 = COPY %4.dsub %7:fpr128 = IMPLICIT_DEF -- GitLab From e7302319b52e3d231216d54d10622b0698928a96 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Mon, 21 Oct 2024 10:53:14 -0400 Subject: [PATCH 245/511] [mlir] Fix shared build. 
NFC --- mlir/test/lib/Pass/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/test/lib/Pass/CMakeLists.txt b/mlir/test/lib/Pass/CMakeLists.txt index 9f79944ff896..f489b7e51e50 100644 --- a/mlir/test/lib/Pass/CMakeLists.txt +++ b/mlir/test/lib/Pass/CMakeLists.txt @@ -1,3 +1,4 @@ +get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) # Exclude tests from libMLIR.so add_mlir_library(MLIRTestPass TestDynamicPipeline.cpp @@ -10,6 +11,7 @@ add_mlir_library(MLIRTestPass ${MLIR_MAIN_INCLUDE_DIR}/mlir/Pass LINK_LIBS PUBLIC + ${conversion_libs} MLIRIR MLIRPass MLIRTestDialect -- GitLab From 17e9752267ed9c81c8da87f3a6d0e01f130b0d04 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Mon, 21 Oct 2024 08:10:22 -0700 Subject: [PATCH 246/511] Revert "[mlir][mlir-spirv-cpu-runner] Move MLIR pass pipeline to mlir-opt" (#113176) Reverts llvm/llvm-project#111575 This caused build failures: https://lab.llvm.org/buildbot/#/builders/138/builds/5244 --- mlir/test/lib/Pass/CMakeLists.txt | 1 - .../lib/Pass/TestSPIRVCPURunnerPipeline.cpp | 47 ------------------- mlir/test/mlir-spirv-cpu-runner/double.mlir | 3 +- .../mlir-spirv-cpu-runner/simple_add.mlir | 3 +- mlir/tools/mlir-opt/mlir-opt.cpp | 2 - .../mlir-spirv-cpu-runner.cpp | 24 ++++++++++ 6 files changed, 26 insertions(+), 54 deletions(-) delete mode 100644 mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp diff --git a/mlir/test/lib/Pass/CMakeLists.txt b/mlir/test/lib/Pass/CMakeLists.txt index f489b7e51e50..b190f054e50b 100644 --- a/mlir/test/lib/Pass/CMakeLists.txt +++ b/mlir/test/lib/Pass/CMakeLists.txt @@ -3,7 +3,6 @@ get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) add_mlir_library(MLIRTestPass TestDynamicPipeline.cpp TestPassManager.cpp - TestSPIRVCPURunnerPipeline.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp b/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp deleted file mode 100644 index ded0d22c3130..000000000000 --- a/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp +++ /dev/null @@ -1,47 +0,0 @@ -//===------------------ TestSPIRVCPURunnerPipeline.cpp --------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implements a pipeline for use by mlir-spirv-cpu-runner tests. 
-// -//===----------------------------------------------------------------------===// - -#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h" -#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVMPass.h" -#include "mlir/Dialect/GPU/Transforms/Passes.h" -#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" -#include "mlir/Dialect/SPIRV/Transforms/Passes.h" -#include "mlir/Pass/PassManager.h" - -using namespace mlir; - -namespace { - -void buildTestSPIRVCPURunnerPipeline(OpPassManager &passManager) { - passManager.addPass(createGpuKernelOutliningPass()); - passManager.addPass(createConvertGPUToSPIRVPass(/*mapMemorySpace=*/true)); - - OpPassManager &nestedPM = passManager.nest(); - nestedPM.addPass(spirv::createSPIRVLowerABIAttributesPass()); - nestedPM.addPass(spirv::createSPIRVUpdateVCEPass()); - passManager.addPass(createLowerHostCodeToLLVMPass()); - passManager.addPass(createConvertSPIRVToLLVMPass()); -} - -} // namespace - -namespace mlir { -namespace test { -void registerTestSPIRVCPURunnerPipeline() { - PassPipelineRegistration<>( - "test-spirv-cpu-runner-pipeline", - "Runs a series of passes for lowering SPIR-V-dialect MLIR to " - "LLVM-dialect MLIR intended for mlir-spirv-cpu-runner.", - buildTestSPIRVCPURunnerPipeline); -} -} // namespace test -} // namespace mlir diff --git a/mlir/test/mlir-spirv-cpu-runner/double.mlir b/mlir/test/mlir-spirv-cpu-runner/double.mlir index 35557ba1e94c..cd551ffb1bd0 100644 --- a/mlir/test/mlir-spirv-cpu-runner/double.mlir +++ b/mlir/test/mlir-spirv-cpu-runner/double.mlir @@ -1,5 +1,4 @@ -// RUN: mlir-opt %s -test-spirv-cpu-runner-pipeline \ -// RUN: | mlir-spirv-cpu-runner - -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \ +// RUN: mlir-spirv-cpu-runner %s -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \ // RUN: | FileCheck %s // CHECK: [8, 8, 8, 8, 8, 8] diff --git a/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir b/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir index 75675a69a675..119e973e45e4 100644 --- a/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir +++ b/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir @@ -1,5 +1,4 @@ -// RUN: mlir-opt %s -test-spirv-cpu-runner-pipeline \ -// RUN: | mlir-spirv-cpu-runner - -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \ +// RUN: mlir-spirv-cpu-runner %s -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \ // RUN: | FileCheck %s // CHECK: data = diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 002c3900056d..36b142484bb0 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -142,7 +142,6 @@ void registerTestSCFWhileOpBuilderPass(); void registerTestSCFWrapInZeroTripCheckPasses(); void registerTestShapeMappingPass(); void registerTestSliceAnalysisPass(); -void registerTestSPIRVCPURunnerPipeline(); void registerTestSPIRVFuncSignatureConversion(); void registerTestSPIRVVectorUnrolling(); void registerTestTensorCopyInsertionPass(); @@ -279,7 +278,6 @@ void registerTestPasses() { mlir::test::registerTestSCFWrapInZeroTripCheckPasses(); mlir::test::registerTestShapeMappingPass(); mlir::test::registerTestSliceAnalysisPass(); - mlir::test::registerTestSPIRVCPURunnerPipeline(); mlir::test::registerTestSPIRVFuncSignatureConversion(); mlir::test::registerTestSPIRVVectorUnrolling(); mlir::test::registerTestTensorCopyInsertionPass(); diff 
--git a/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp b/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp index 22ad1024db4a..7e0b51cac806 100644 --- a/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp +++ b/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp @@ -12,12 +12,18 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" +#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h" +#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVMPass.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" +#include "mlir/Dialect/SPIRV/Transforms/Passes.h" #include "mlir/ExecutionEngine/JitRunner.h" #include "mlir/ExecutionEngine/OptUtils.h" #include "mlir/Pass/Pass.h" @@ -69,6 +75,23 @@ convertMLIRModule(Operation *op, llvm::LLVMContext &context) { return mainModule; } +static LogicalResult runMLIRPasses(Operation *module, + JitRunnerOptions &options) { + PassManager passManager(module->getContext(), + module->getName().getStringRef()); + if (failed(applyPassManagerCLOptions(passManager))) + return failure(); + passManager.addPass(createGpuKernelOutliningPass()); + passManager.addPass(createConvertGPUToSPIRVPass(/*mapMemorySpace=*/true)); + + OpPassManager &nestedPM = passManager.nest(); + nestedPM.addPass(spirv::createSPIRVLowerABIAttributesPass()); + nestedPM.addPass(spirv::createSPIRVUpdateVCEPass()); + passManager.addPass(createLowerHostCodeToLLVMPass()); + passManager.addPass(createConvertSPIRVToLLVMPass()); + return passManager.run(module); +} + int main(int argc, char **argv) { llvm::InitLLVM y(argc, argv); @@ -76,6 +99,7 @@ int main(int argc, char **argv) { llvm::InitializeNativeTargetAsmPrinter(); mlir::JitRunnerConfig jitRunnerConfig; + jitRunnerConfig.mlirTransformer = runMLIRPasses; jitRunnerConfig.llvmModuleBuilder = convertMLIRModule; mlir::DialectRegistry registry; -- GitLab From c5ca1b8626db71fa7ac5d851fa3a0710641136ff Mon Sep 17 00:00:00 2001 From: Zaara Syeda Date: Mon, 21 Oct 2024 11:13:16 -0400 Subject: [PATCH 247/511] [PPC] Add custom lowering for uaddo (#110137) Improve the codegen for uaddo node for i64 in 64-bit mode and i32 in 32-bit mode by custom lowering. --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 42 ++++++++++++++ llvm/lib/Target/PowerPC/PPCISelLowering.h | 1 + llvm/test/CodeGen/PowerPC/sat-add.ll | 5 +- llvm/test/CodeGen/PowerPC/uaddo-32.ll | 38 +++++++++++++ llvm/test/CodeGen/PowerPC/uaddo-64.ll | 62 +++++++++++++++++++++ 5 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/uaddo-32.ll create mode 100644 llvm/test/CodeGen/PowerPC/uaddo-64.ll diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 5d6c7c729a76..cb0c8bade670 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -198,6 +198,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } } + setOperationAction(ISD::UADDO, isPPC64 ? MVT::i64 : MVT::i32, Custom); + // Match BITREVERSE to customized fast code sequence in the td file. 
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); @@ -11967,11 +11969,51 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("ERROR:Should return for all cases within swtich."); } +SDValue PPCTargetLowering::LowerUaddo(SDValue Op, SelectionDAG &DAG) const { + // Default to target independent lowering if there is a logical user of the + // carry-bit. + for (SDNode *U : Op->uses()) { + if (U->getOpcode() == ISD::SELECT) + return SDValue(); + if (ISD::isBitwiseLogicOp(U->getOpcode())) { + for (unsigned i = 0, ie = U->getNumOperands(); i != ie; ++i) { + if (U->getOperand(i).getOpcode() != ISD::UADDO && + U->getOperand(i).getOpcode() != ISD::MERGE_VALUES) + return SDValue(); + } + } + } + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); + + // Default to target independent lowering for special cases handled there. + if (isOneConstant(RHS) || isAllOnesConstant(RHS)) + return SDValue(); + + EVT VT = Op.getNode()->getValueType(0); + + SDValue ADDC; + SDValue Overflow; + SDVTList VTs = Op.getNode()->getVTList(); + + ADDC = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), LHS, RHS); + Overflow = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(VT, MVT::Glue), + DAG.getConstant(0, dl, VT), DAG.getConstant(0, dl, VT), + ADDC.getValue(1)); + SDValue OverflowTrunc = + DAG.getNode(ISD::TRUNCATE, dl, Op.getNode()->getValueType(1), Overflow); + SDValue Res = + DAG.getNode(ISD::MERGE_VALUES, dl, VTs, ADDC.getValue(0), OverflowTrunc); + return Res; +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Wasn't expecting to be able to lower this!"); + case ISD::UADDO: return LowerUaddo(Op, DAG); case ISD::FPOW: return lowerPow(Op, DAG); case ISD::FSIN: return lowerSin(Op, DAG); case ISD::FCOS: return lowerCos(Op, DAG); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 8c7961e641c3..0adbad868459 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1277,6 +1277,7 @@ namespace llvm { SDValue LowerGlobalTLSAddressLinux(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUaddo(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll index f699ea54192d..8fff2c28da24 100644 --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -170,11 +170,10 @@ define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_notval: ; CHECK: # %bb.0: -; CHECK-NEXT: li 5, -43 ; CHECK-NEXT: addi 4, 3, 42 -; CHECK-NEXT: cmpld 3, 5 +; CHECK-NEXT: cmpld 4, 3 ; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: iselgt 3, 3, 4 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, -43 diff --git a/llvm/test/CodeGen/PowerPC/uaddo-32.ll b/llvm/test/CodeGen/PowerPC/uaddo-32.ll new file mode 
100644 index 000000000000..b5989fc2ee2d --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/uaddo-32.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s + +define noundef i32 @add(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 6 +; CHECK-NEXT: stw 4, 0(5) +; CHECK-NEXT: blr +entry: + %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) + %1 = extractvalue { i32, i1 } %0, 1 + %2 = extractvalue { i32, i1 } %0, 0 + %3 = zext i1 %1 to i32 + store i32 %3, ptr %ovf, align 8 + ret i32 %2 +} + +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) + +define noundef zeroext i1 @add_overflow(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { +; CHECK-LABEL: add_overflow: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 4, 3, 4 +; CHECK-NEXT: addze 3, 6 +; CHECK-NEXT: stw 4, 0(5) +; CHECK-NEXT: blr +entry: + %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) + %1 = extractvalue { i32, i1 } %0, 1 + %2 = extractvalue { i32, i1 } %0, 0 + store i32 %2, ptr %ovf, align 8 + ret i1 %1 +} diff --git a/llvm/test/CodeGen/PowerPC/uaddo-64.ll b/llvm/test/CodeGen/PowerPC/uaddo-64.ll new file mode 100644 index 000000000000..736b54e23d25 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/uaddo-64.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s + +define noundef i64 @add(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 6 +; CHECK-NEXT: std 4, 0(5) +; CHECK-NEXT: blr +entry: + %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) + %1 = extractvalue { i64, i1 } %0, 1 + %2 = extractvalue { i64, i1 } %0, 0 + %3 = zext i1 %1 to i64 + store i64 %3, ptr %ovf, align 8 + ret i64 %2 +} + +declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) + +define noundef zeroext i1 @add_overflow(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { +; CHECK-LABEL: add_overflow: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 4, 3, 4 +; CHECK-NEXT: addze 3, 6 +; CHECK-NEXT: std 4, 0(5) +; CHECK-NEXT: blr +entry: + %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) + %1 = extractvalue { i64, i1 } %0, 1 + %2 = extractvalue { i64, i1 } %0, 0 + store i64 %2, ptr %ovf, align 8 + ret i1 %1 +} + +define noundef i64 @addWithCarryIn (i64 noundef %a, i64 noundef %b, i64 noundef %c, ptr nocapture noundef writeonly %ovf) { +; CHECK-LABEL: addWithCarryIn: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 7, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 7 +; CHECK-NEXT: addc 3, 3, 5 +; CHECK-NEXT: addze 5, 7 +; CHECK-NEXT: or 4, 4, 5 +; CHECK-NEXT: std 4, 0(6) +; CHECK-NEXT: blr +entry: + %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) + %1 = extractvalue { i64, i1 } %0, 1 + %2 = extractvalue { i64, i1 } %0, 0 + %3 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %2, 
i64 %c) + %4 = extractvalue { i64, i1 } %3, 1 + %5 = extractvalue { i64, i1 } %3, 0 + %6 = or i1 %1, %4 + %7 = zext i1 %6 to i64 + store i64 %7, ptr %ovf, align 8 + ret i64 %5 +} -- GitLab From 900b6369e2f5fbc229371a142fdcd28b5280dbc0 Mon Sep 17 00:00:00 2001 From: Jake Egan Date: Mon, 21 Oct 2024 11:19:26 -0400 Subject: [PATCH 248/511] [AIX][test] XFAIL constant folding log1p test Test added by commit 47a6da2d4dc7d996eb2678243ac566822d59e483 fails on the AIX bot. So XFAIL for now to investigate further. --- llvm/test/Transforms/InstCombine/log1p.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/Transforms/InstCombine/log1p.ll b/llvm/test/Transforms/InstCombine/log1p.ll index 81d3cc8a4f7a..bbf89db8c341 100644 --- a/llvm/test/Transforms/InstCombine/log1p.ll +++ b/llvm/test/Transforms/InstCombine/log1p.ll @@ -1,3 +1,4 @@ +; XFAIL: target={{.*}}-aix{{.*}} ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -- GitLab From 120e42d3135f558b5e0a73da1c6484571eeff941 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Mon, 21 Oct 2024 08:51:36 -0700 Subject: [PATCH 249/511] [MemProf] Improve metadata cleanup in LTO backend (#113039) Previously we were attempting to remove the memprof-related metadata when iterating through instructions in the LTO backend. However, we missed some as there are a number of cases where we skip instructions, or even entire functions. Simplify the cleanup and ensure all is removed by doing a full sweep over all instructions after completing cloning. This is largely NFC except with -memprof-report-hinted-sizes enabled, because we were propagating and simplifying the metadata after inlining in the LTO backend, which caused some stray messages as metadata was re-converted to attributes. --- .../IPO/MemProfContextDisambiguation.cpp | 23 ++++++++++++++----- llvm/test/ThinLTO/X86/memprof-icp.ll | 5 +++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 5ade0db343f2..4efd683dfca3 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -4264,9 +4264,6 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { AllocVersionsThinBackend++; if (!MaxAllocVersionsThinBackend) MaxAllocVersionsThinBackend = 1; - // Remove any remaining callsite metadata and we can skip the rest of - // the handling for this instruction, since no cloning needed. - I.setMetadata(LLVMContext::MD_callsite, nullptr); continue; } @@ -4419,9 +4416,6 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { CloneCallsite(Callsite->second, CB, CalledFunction); } } - // Memprof and callsite metadata on memory allocations no longer needed. - I.setMetadata(LLVMContext::MD_memprof, nullptr); - I.setMetadata(LLVMContext::MD_callsite, nullptr); } } @@ -4429,6 +4423,23 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE); } + // We skip some of the functions and instructions above, so remove all the + // metadata in a single sweep here. + for (auto &F : M) { + // We can skip memprof clones because createFunctionClones already strips + // the metadata from the newly created clones. 
+ if (F.isDeclaration() || isMemProfClone(F)) + continue; + for (auto &BB : F) { + for (auto &I : BB) { + if (!isa(I)) + continue; + I.setMetadata(LLVMContext::MD_memprof, nullptr); + I.setMetadata(LLVMContext::MD_callsite, nullptr); + } + } + } + return Changed; } diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll index 2e976794425b..f17e19e1f77e 100644 --- a/llvm/test/ThinLTO/X86/memprof-icp.ll +++ b/llvm/test/ThinLTO/X86/memprof-icp.ll @@ -176,7 +176,10 @@ ; RUN: -pass-remarks=. -save-temps \ ; RUN: -o %t.noicp.out 2>&1 | FileCheck %s --implicit-check-not "created clone" -; RUN: llvm-dis %t.noicp.out.2.4.opt.bc -o - | FileCheck %s --implicit-check-not "_Z3fooR2B0j.memprof" +;; Verify that we did not do any cloning of the function with the indirect call +;; when memprof ICP is off. However, we should still have removed the callsite +;; metadata. +; RUN: llvm-dis %t.noicp.out.2.4.opt.bc -o - | FileCheck %s --implicit-check-not "_Z3fooR2B0j.memprof" --implicit-check-not "!callsite" ; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1 ; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1 -- GitLab From 8417f6af54c8f6dcf5893ab1352b50bf33c5a1ba Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 21 Oct 2024 17:51:42 +0200 Subject: [PATCH 250/511] [win/asan] Fix instruction size for 44 0f b6 1a movzx r11d,BYTE PTR [rdx] is four bytes long. Follow-up to #111638 --- compiler-rt/lib/interception/interception_win.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp index 4f60d4251303..077a536dd2a3 100644 --- a/compiler-rt/lib/interception/interception_win.cpp +++ b/compiler-rt/lib/interception/interception_win.cpp @@ -769,6 +769,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { switch (*(u32*)(address)) { case 0x1ab60f44: // 44 0f b6 1a : movzx r11d, BYTE PTR [rdx] + return 4; case 0x24448b48: // 48 8b 44 24 XX : mov rax, QWORD ptr [rsp + XX] case 0x246c8948: // 48 89 6C 24 XX : mov QWORD ptr [rsp + XX], rbp case 0x245c8948: // 48 89 5c 24 XX : mov QWORD PTR [rsp + XX], rbx -- GitLab From 42ba452aa94e4da277842d8990ad958a6256e558 Mon Sep 17 00:00:00 2001 From: Spencer Abson Date: Mon, 21 Oct 2024 15:47:10 +0000 Subject: [PATCH 251/511] [NFC] Fix -WError for unused Encode/Decode ZK methods Remove the unused functions and register classes from the change below https://github.com/llvm/llvm-project/commit/4679583181a9032b4f7c6476c7a1bfefe5724b47 --- llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 9 --------- .../AArch64/AsmParser/AArch64AsmParser.cpp | 5 ----- .../Disassembler/AArch64Disassembler.cpp | 18 ------------------ .../MCTargetDesc/AArch64MCCodeEmitter.cpp | 18 ------------------ .../AArch64/GlobalISel/regbank-inlineasm.mir | 4 ++-- llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll | 14 +++++++------- .../emit_fneg_with_non_register_operand.mir | 4 ++-- llvm/test/CodeGen/AArch64/fmlal-loreg.ll | 2 +- .../test/CodeGen/AArch64/peephole-insvigpr.mir | 2 +- 9 files changed, 13 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 8516ab2c7dd7..7f629a78fb44 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1210,15 +1210,6 @@ def ZPRMul2AsmOp32_Hi : ZPRAsmOperand<"VectorS_Hi", 32, "Mul2_Hi">; def 
ZPRMul2AsmOp64_Lo : ZPRAsmOperand<"VectorD_Lo", 64, "Mul2_Lo">; def ZPRMul2AsmOp64_Hi : ZPRAsmOperand<"VectorD_Hi", 64, "Mul2_Hi">; -def ZPR_K : RegisterClass<"AArch64", [untyped], 128, - (add Z20, Z21, Z22, Z23, Z28, Z29, Z30, Z31)>; - -def ZK : RegisterOperand">{ - let EncoderMethod = "EncodeZK"; - let DecoderMethod = "DecodeZK"; - let ParserMatchClass = ZPRAsmOperand<"Vector_20to23or28to31", 0, "_K">; -} - def ZPR8Mul2_Lo : ZPRMul2_MinToMaxRegOp<"b", ZPRMul2AsmOp8_Lo, 0, 14, ElementSizeB, ZPRMul2_Lo>; def ZPR8Mul2_Hi : ZPRMul2_MinToMaxRegOp<"b", ZPRMul2AsmOp8_Hi, 16, 30, ElementSizeB, ZPRMul2_Hi>; def ZPR16Mul2_Lo : ZPRMul2_MinToMaxRegOp<"h", ZPRMul2AsmOp16_Lo, 0, 14, ElementSizeH, ZPRMul2_Lo>; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d0d2fda23a58..72b9f252a718 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1264,7 +1264,6 @@ public: case AArch64::ZPR_4bRegClassID: case AArch64::ZPRMul2_LoRegClassID: case AArch64::ZPRMul2_HiRegClassID: - case AArch64::ZPR_KRegClassID: RK = RegKind::SVEDataVector; break; case AArch64::PPRRegClassID: @@ -6119,9 +6118,6 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, case Match_InvalidZPRMul2_Hi64: return Error(Loc, "Invalid restricted vector register, expected even " "register in z16.d..z30.d"); - case Match_InvalidZPR_K0: - return Error(Loc, "invalid restricted vector register, expected register " - "in z20..z23 or z28..z31"); case Match_InvalidSVEPattern: return Error(Loc, "invalid predicate pattern"); case Match_InvalidSVEPPRorPNRAnyReg: @@ -6831,7 +6827,6 @@ bool AArch64AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidZPRMul2_Hi32: case Match_InvalidZPRMul2_Lo64: case Match_InvalidZPRMul2_Hi64: - case Match_InvalidZPR_K0: case Match_InvalidSVEVectorList2x8Mul2: case Match_InvalidSVEVectorList2x16Mul2: case Match_InvalidSVEVectorList2x32Mul2: diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 4a4b89da7188..87c4245b5535 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -49,8 +49,6 @@ template static DecodeStatus DecodeZPRMul2_MinMax(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus DecodeZK(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); template static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -389,22 +387,6 @@ static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -// Zk Is the name of the control vector register Z20-Z23 or Z28-Z31, encoded in -// the "K:Zk" fields. Z20-Z23 = 000, 001,010, 011 and Z28-Z31 = 100, 101, 110, -// 111 -static DecodeStatus DecodeZK(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder) { - // RegNo < 4 => Reg is in Z20-Z23 (offset 20) - // RegNo >= 4 => Reg is in Z28-Z31 (offset 24) - unsigned Reg = (RegNo < 4) ? 
(RegNo + 20) : (RegNo + 24); - if (!(Reg >= 20 && Reg <= 23) && !(Reg >= 28 && Reg <= 31)) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::ZPRRegClassID].getRegister(Reg); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - static DecodeStatus DecodeZPR4Mul4RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 85ffb8639dad..61b838476669 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -195,9 +195,6 @@ public: uint32_t EncodeRegMul_MinMax(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - uint32_t EncodeZK(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; uint32_t EncodePNR_p8to15(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -576,21 +573,6 @@ AArch64MCCodeEmitter::EncodeRegMul_MinMax(const MCInst &MI, unsigned OpIdx, return (RegVal - Min) / Multiple; } -// Zk Is the name of the control vector register Z20-Z23 or Z28-Z31, encoded in -// the "K:Zk" fields. Z20-Z23 = 000, 001,010, 011 and Z28-Z31 = 100, 101, 110, -// 111 -uint32_t AArch64MCCodeEmitter::EncodeZK(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - auto RegOpnd = MI.getOperand(OpIdx).getReg(); - unsigned RegVal = Ctx.getRegisterInfo()->getEncodingValue(RegOpnd); - // Z28 => RegVal = 28 (28 - 24 = 4) Z28 = 4 - if (RegOpnd > AArch64::Z27) - return (RegVal - 24); - // Z20 => RegVal = 20 (20 -20 = 0) Z20 = 0 - return (RegVal - 20); -} - uint32_t AArch64MCCodeEmitter::EncodePNR_p8to15(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir index f1d1b691fe1a..7186b3de442b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir @@ -57,7 +57,7 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_reg_output - ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 2490378 /* regdef:GPR32common */, def %0 + ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 2490378 /* regdef:FPR32_with_hsub_in_FPR16_lo */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -75,7 +75,7 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_mixed_types - ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 2490378 /* regdef:GPR32common */, def %0, 3342346 /* regdef:FPR64 */, def %1 + ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 2490378 /* regdef:FPR32_with_hsub_in_FPR16_lo */, def %0, 3342346 /* regdef:GPR64 */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr(s64) = COPY %1 ; CHECK-NEXT: $d0 = COPY [[COPY1]](s64) diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll index 9f8897575b3d..2d12c08eb8ee 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll @@ -13,7 +13,7 @@ define @test_svadd_i8( %Zn, asm "add $0.b, $1.b, 
$2.b", "=w,w,y"( %Zn, %Zm) @@ -29,7 +29,7 @@ define @test_svsub_i64( %Zn, asm "sub $0.d, $1.d, $2.d", "=w,w,x"( %Zn, %Zm) @@ -45,7 +45,7 @@ define @test_svfmul_f16( %Zn, asm "fmul $0.h, $1.h, $2.h", "=w,w,y"( %Zn, %Zm) @@ -61,7 +61,7 @@ define @test_svfmul_f( %Zn, asm "fmul $0.s, $1.s, $2.s", "=w,w,x"( %Zn, %Zm) @@ -79,7 +79,7 @@ define @test_svfadd_f16( %Pg, asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Upl,w,w"( %Pg, %Zn, %Zm) @@ -95,7 +95,7 @@ define @test_incp( %Pg, ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ppr = COPY $p0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"incp $0.s, $1", 0 /* attdialect */, 5767178 /* regdef:ZPR */, def %2, 458761 /* reguse:PPR */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"incp $0.s, $1", 0 /* attdialect */, 5701642 /* regdef:ZPR */, def %2, 393225 /* reguse:PPR */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) ; CHECK-NEXT: $z0 = COPY %2 ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "incp $0.s, $1", "=w,@3Upa,0"( %Pg, %Zn) @@ -113,7 +113,7 @@ define @test_svfadd_f16_Uph_constraint( %P ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr_p8to15 = COPY [[COPY2]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5767178 /* regdef:ZPR */, def %3, 786441 /* reguse:PPR_p8to15 */, [[COPY3]], 5767177 /* reguse:ZPR */, [[COPY4]], 5767177 /* reguse:ZPR */, [[COPY5]] + ; CHECK-NEXT: INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5701642 /* regdef:ZPR */, def %3, 720905 /* reguse:PPR_p8to15 */, [[COPY3]], 5701641 /* reguse:ZPR */, [[COPY4]], 5701641 /* reguse:ZPR */, [[COPY5]] ; CHECK-NEXT: $z0 = COPY %3 ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"( %Pg, %Zn, %Zm) diff --git a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir index ffa7453e48b4..260f81641770 100644 --- a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir +++ b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir @@ -91,10 +91,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @c ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3342346 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3342346 /* regdef:GPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY %2 ; CHECK-NEXT: [[LDRDui1:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3342346 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3342346 /* regdef:GPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) ; CHECK-NEXT: [[FNEGDr:%[0-9]+]]:fpr64 = FNEGDr %2 ; CHECK-NEXT: nofpexcept FCMPDrr %4, killed [[FNEGDr]], implicit-def $nzcv, implicit $fpcr ; CHECK-NEXT: Bcc 1, %bb.2, implicit $nzcv diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll 
b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index 31ead890ba8a..20737a731839 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -11,8 +11,8 @@ define <4 x float> @test(ptr %lhs_panel, ptr %rhs_panel, <4 x float> %a) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset b8, -16
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    ldr q16, [x1]
 ; CHECK-NEXT:    ldr q8, [x0]
+; CHECK-NEXT:    ldr q16, [x1]
 ; CHECK-NEXT:    lsr x9, x8, #32
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    nop
diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
index f8af5b963701..822083bbf8d5 100644
--- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
+++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir
@@ -487,7 +487,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr64all = IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[DEF]]
-    ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 3342346 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 3342346 /* regdef:GPR64 */, def %1, 262158 /* mem:m */, killed [[COPY1]]
     ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-- 
GitLab

From 9e03920cbf946e7ba282e99213707643a23ae5fb Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 21 Oct 2024 08:57:36 -0700
Subject: [PATCH 252/511] [SLP] Ignore root gather node when searching for
 reuses

The root gather/buildvector node should be ignored when the SLP
vectorizer tries to find matching gather nodes vectorized earlier. This
node is definitely the last one in the pipeline and it does not have
users. Reusing it may cause a compiler crash.

Fixes #113143
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  2 +-
 .../X86/root-gather-reused-scalar.ll          | 40 +++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/root-gather-reused-scalar.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1098bf578d2d..a11e3f3815cb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12466,7 +12466,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
-      if (TEPtr == TE)
+      if (TEPtr == TE || TEPtr->Idx == 0)
         continue;
       assert(any_of(TEPtr->Scalars,
                     [&](Value *V) { return GatheredScalars.contains(V); }) &&
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/root-gather-reused-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/root-gather-reused-scalar.ll
new file mode 100644
index 000000000000..7850bb89c8e4
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/root-gather-reused-scalar.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %a, i32 %0, i32 %1, i1 %cmp1) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[A:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i1 [[CMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TOBOOL10_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[CMP4_3:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = and i1 [[CMP4_3]], [[TOBOOL10_NOT]]
+; CHECK-NEXT:    [[CMP2_2:%.*]] = xor i1 [[TOBOOL10_NOT]], true
+; CHECK-NEXT:    [[CONV3_2:%.*]] = zext i1 [[CMP2_2]] to i32
+; CHECK-NEXT:    [[CMP4_2:%.*]] = icmp ne i32 [[TMP2]], [[CONV3_2]]
+; CHECK-NEXT:    [[CMP2_1:%.*]] = xor i1 [[CMP1]], true
+; CHECK-NEXT:    [[CONV3_1:%.*]] = zext i1 [[CMP2_1]] to i32
+; CHECK-NEXT:    [[CMP4_1:%.*]] = icmp ne i32 [[TMP2]], [[CONV3_1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i1 [[CMP4_2]], i1 false
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i1 [[CMP4_1]], i1 false
+; CHECK-NEXT:    [[AND_3:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    store i32 [[AND_3]], ptr [[A]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %2 = load i32, ptr %a, align 4
+  %tobool10.not = icmp eq i32 %0, 0
+  %cmp4.3 = icmp ne i32 %1, 0
+  %3 = and i1 %cmp4.3, %tobool10.not
+  %cmp2.2 = xor i1 %tobool10.not, true
+  %conv3.2 = zext i1 %cmp2.2 to i32
+  %cmp4.2 = icmp ne i32 %2, %conv3.2
+  %cmp2.1 = xor i1 %cmp1, true
+  %conv3.1 = zext i1 %cmp2.1 to i32
+  %cmp4.1 = icmp ne i32 %2, %conv3.1
+  %4 = select i1 %3, i1 %cmp4.2, i1 false
+  %5 = select i1 %4, i1 %cmp4.1, i1 false
+  %and.3 = zext i1 %5 to i32
+  store i32 %and.3, ptr %a, align 4
+  ret void
+}
-- 
GitLab

From 54c93aabec965469fe7db1f4391a190e3d640feb Mon Sep 17 00:00:00 2001
From: vporpo
Date: Mon, 21 Oct 2024 09:17:46 -0700
Subject: [PATCH 253/511] [SandboxVec][Legality] Scaffolding for Legality
 (#112623)

This patch adds a LegalityResultWithReason class for describing the
reason why the legality analysis decided not to vectorize the code.
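
As a rough usage sketch (illustrative only; the surrounding pass
boilerplate and the Bndl variable are assumed, while the class, enum and
function names are the ones added by this patch), a consumer such as the
bottom-up vectorizer is expected to dispatch on the returned result,
mirroring the switch added to BottomUpVec.cpp below:

  // Bndl is an ArrayRef<sandboxir::Value *> collected by the vectorizer.
  sandboxir::LegalityAnalysis Legality;
  const auto &LegalityRes = Legality.canVectorize(Bndl);
  switch (LegalityRes.getSubclassID()) {
  case sandboxir::LegalityResultID::Widen:
    // Legal to vectorize: emit one vector instruction combining the
    // scalars in Bndl.
    break;
  case sandboxir::LegalityResultID::Pack:
    // Not vectorizing: keep the values scalar. In debug builds the
    // result also prints its reason, e.g. "Pack Reason: DiffOpcodes".
    break;
  }

The LegalityResult objects are owned by the LegalityAnalysis instance
(see ResultPool), so callers only ever hold references to them.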
---
 .../Vectorize/SandboxVectorizer/Legality.h    | 87 ++++++++++++++++++-
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |  1 +
 .../Vectorize/SandboxVectorizer/Legality.cpp  | 39 +++++++++
 .../SandboxVectorizer/Passes/BottomUpVec.cpp  |  4 +
 .../SandboxVectorizer/LegalityTest.cpp        | 21 +++++
 5 files changed, 149 insertions(+), 3 deletions(-)
 create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index 50fa56c5b219..233abf3efd64 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -13,6 +13,8 @@
 #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_LEGALITY_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm::sandboxir {
 
@@ -20,9 +22,38 @@ class LegalityAnalysis;
 class Value;
 
 enum class LegalityResultID {
+  Pack,  ///> Collect scalar values.
   Widen, ///> Vectorize by combining scalars to a vector.
 };
 
+/// The reason for vectorizing or not vectorizing.
+enum class ResultReason {
+  DiffOpcodes,
+  DiffTypes,
+};
+
+#ifndef NDEBUG
+struct ToStr {
+  static const char *getLegalityResultID(LegalityResultID ID) {
+    switch (ID) {
+    case LegalityResultID::Pack:
+      return "Pack";
+    case LegalityResultID::Widen:
+      return "Widen";
+    }
+  }
+
+  static const char *getVecReason(ResultReason Reason) {
+    switch (Reason) {
+    case ResultReason::DiffOpcodes:
+      return "DiffOpcodes";
+    case ResultReason::DiffTypes:
+      return "DiffTypes";
+    }
+  }
+};
+#endif // NDEBUG
+
 /// The legality outcome is represented by a class rather than an enum class
 /// because in some cases the legality checks are expensive and look for a
 /// particular instruction that can be passed along to the vectorizer to avoid
@@ -35,7 +66,34 @@ protected:
   friend class LegalityAnalysis;
 
 public:
+  virtual ~LegalityResult() {}
   LegalityResultID getSubclassID() const { return ID; }
+#ifndef NDEBUG
+  virtual void print(raw_ostream &OS) const {
+    OS << ToStr::getLegalityResultID(ID);
+  }
+  LLVM_DUMP_METHOD void dump() const;
+  friend raw_ostream &operator<<(raw_ostream &OS, const LegalityResult &LR) {
+    LR.print(OS);
+    return OS;
+  }
+#endif // NDEBUG
+};
+
+/// Base class for results with reason.
+class LegalityResultWithReason : public LegalityResult {
+  ResultReason Reason;
+  LegalityResultWithReason(LegalityResultID ID, ResultReason Reason)
+      : LegalityResult(ID), Reason(Reason) {}
+  friend class Pack; // For constructor.
+
+public:
+#ifndef NDEBUG
+  void print(raw_ostream &OS) const override {
+    LegalityResult::print(OS);
+    OS << " Reason: " << ToStr::getVecReason(Reason);
+  }
+#endif
+};
 
 class Widen final : public LegalityResult {
@@ -48,14 +106,37 @@ public:
   }
 };
 
+class Pack final : public LegalityResultWithReason {
+  Pack(ResultReason Reason)
+      : LegalityResultWithReason(LegalityResultID::Pack, Reason) {}
+  friend class LegalityAnalysis; // For constructor.
+
+public:
+  static bool classof(const LegalityResult *From) {
+    return From->getSubclassID() == LegalityResultID::Pack;
+  }
+};
+
 /// Performs the legality analysis and returns a LegalityResult object.
 class LegalityAnalysis {
+  /// Owns the legality result objects created by createLegalityResult().
+  SmallVector<std::unique_ptr<LegalityResult>> ResultPool;
+  /// Checks opcodes, types and other IR-specifics and returns a ResultReason
+  /// object if not vectorizable, or nullptr otherwise.
+  std::optional<ResultReason>
+  notVectorizableBasedOnOpcodesAndTypes(ArrayRef<Value *> Bndl);
+
 public:
   LegalityAnalysis() = default;
-  LegalityResult canVectorize(ArrayRef<Value *> Bndl) {
-    // TODO: For now everything is legal.
-    return Widen();
+  /// A LegalityResult factory.
+  template <typename ResultT, typename... ArgsT>
+  ResultT &createLegalityResult(ArgsT... Args) {
+    ResultPool.push_back(std::unique_ptr<ResultT>(new ResultT(Args...)));
+    return cast<ResultT>(*ResultPool.back());
   }
+  /// Checks if it's legal to vectorize the instructions in \p Bndl.
+  /// \Returns a LegalityResult object owned by LegalityAnalysis.
+  LegalityResult &canVectorize(ArrayRef<Value *> Bndl);
 };
 
 } // namespace llvm::sandboxir
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index fc4355af5af6..d769d5100afd 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize
   LoopVectorize.cpp
   SandboxVectorizer/DependencyGraph.cpp
   SandboxVectorizer/Interval.cpp
+  SandboxVectorizer/Legality.cpp
  SandboxVectorizer/Passes/BottomUpVec.cpp
   SandboxVectorizer/Passes/RegionsFromMetadata.cpp
   SandboxVectorizer/SandboxVectorizer.cpp
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
new file mode 100644
index 000000000000..0e2cd83c37b0
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
@@ -0,0 +1,39 @@
+//===- Legality.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h"
+#include "llvm/SandboxIR/Value.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm::sandboxir {
+
+#ifndef NDEBUG
+void LegalityResult::dump() const {
+  print(dbgs());
+  dbgs() << "\n";
+}
+#endif // NDEBUG
+
+std::optional<ResultReason>
+LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
+    ArrayRef<Value *> Bndl) {
+  // TODO: Unimplemented.
+  return std::nullopt;
+}
+
+LegalityResult &LegalityAnalysis::canVectorize(ArrayRef<Value *> Bndl) {
+  if (auto ReasonOpt = notVectorizableBasedOnOpcodesAndTypes(Bndl))
+    return createLegalityResult<Pack>(*ReasonOpt);
+
+  // TODO: Check for existing vectors containing values in Bndl.
+
+  // TODO: Check with scheduler.
+
+  return createLegalityResult<Widen>();
+}
+} // namespace llvm::sandboxir
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index 6171d5e52b58..f11420e47f3e 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -50,6 +50,10 @@ void BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl) {
     }
     break;
   }
+  case LegalityResultID::Pack: {
+    // TODO: Unimplemented
+    llvm_unreachable("Unimplemented");
+  }
   }
 }
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
index e16222ddb2d6..76e5a5ce5aed 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
@@ -55,3 +55,24 @@ define void @foo(ptr %ptr) {
   auto Result = Legality.canVectorize({St0, St1});
   EXPECT_TRUE(isa<sandboxir::Widen>(Result));
 }
+
+#ifndef NDEBUG
+TEST_F(LegalityTest, LegalityResultDump) {
+  auto Matches = [](const sandboxir::LegalityResult &Result,
+                    const std::string &ExpectedStr) -> bool {
+    std::string Buff;
+    raw_string_ostream OS(Buff);
+    Result.print(OS);
+    return Buff == ExpectedStr;
+  };
+  sandboxir::LegalityAnalysis Legality;
+  EXPECT_TRUE(
+      Matches(Legality.createLegalityResult<sandboxir::Widen>(), "Widen"));
+  EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
+                          sandboxir::ResultReason::DiffOpcodes),
+                      "Pack Reason: DiffOpcodes"));
+  EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
+                          sandboxir::ResultReason::DiffTypes),
+                      "Pack Reason: DiffTypes"));
+}
+#endif // NDEBUG
-- 
GitLab

From fc59f2cc0f191bb7a0706dfb65e3e46fef69f466 Mon Sep 17 00:00:00 2001
From: RolandF77 <55763885+RolandF77@users.noreply.github.com>
Date: Mon, 21 Oct 2024 12:19:07 -0400
Subject: [PATCH 254/511] [PowerPC] special case small int constant for custom
 scalar_to_vector (#109850)

Special case small int constant in the PPC custom lowering of
scalar_to_vector.
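
For example (the IR below is taken from the new const-stov.ll test added
in this patch), inserting a small constant into a loaded vector:

  define <4 x i32> @i32(ptr %p) {
  entry:
    %0 = load <4 x i32>, ptr %p, align 16
    %vecinit1 = insertelement <4 x i32> %0, i32 7, i64 1
    ret <4 x i32> %vecinit1
  }

previously went through a stack store and vector reload to materialize
the constant on subtargets without direct moves. With this change, a
constant in the signed 5-bit range [-16, 15] with an element size of 32
bits or less is splatted directly via getCanonicalConstSplat (e.g.
vspltisw on pwr7) and then merged into place, avoiding the memory round
trip.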
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   | 10 +-
 .../build-vector-from-load-and-zeros.ll       | 80 ++++-----
 .../PowerPC/canonical-merge-shuffles.ll       | 10 +-
 llvm/test/CodeGen/PowerPC/const-stov.ll       | 164 ++++++++++++++++++
 llvm/test/CodeGen/PowerPC/load-and-splat.ll   | 14 +-
 .../CodeGen/PowerPC/p10-splatImm32-undef.ll   | 16 +-
 .../CodeGen/PowerPC/ppc-32bit-build-vector.ll | 35 ++--
 7 files changed, 231 insertions(+), 98 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/const-stov.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index cb0c8bade670..7199fac9b110 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11595,6 +11595,15 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
   MachineFunction &MF = DAG.getMachineFunction();
   SDValue Op0 = Op.getOperand(0);
+  EVT ValVT = Op0.getValueType();
+  unsigned EltSize = Op.getValueType().getScalarSizeInBits();
+  if (isa<ConstantSDNode>(Op0) && EltSize <= 32) {
+    int64_t IntVal = Op.getConstantOperandVal(0);
+    if (IntVal >= -16 && IntVal <= 15)
+      return getCanonicalConstSplat(IntVal, EltSize / 8, Op.getValueType(), DAG,
+                                    dl);
+  }
+
   ReuseLoadInfo RLI;
   if (Subtarget.hasLFIWAX() && Subtarget.hasVSX() &&
       Op.getValueType() == MVT::v4i32 && Op0.getOpcode() == ISD::LOAD &&
@@ -11619,7 +11628,6 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
   SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
 
   SDValue Val = Op0;
-  EVT ValVT = Val.getValueType();
 
   // P10 hardware store forwarding requires that a single store contains all
   // the data for the load. P10 is able to merge a pair of adjacent stores. Try
   // to avoid load hit store on P10 when running binaries compiled for older
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
index fba6725e2b2a..2259b6e0f44d 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
@@ -26,18 +26,14 @@ define <2 x i64> @build_v2i64_extload_0(ptr nocapture noundef readonly %p) {
 ;
 ; PWR7-LE-LABEL: build_v2i64_extload_0:
 ; PWR7-LE:       # %bb.0: # %entry
-; PWR7-LE-NEXT:    li 4, 0
-; PWR7-LE-NEXT:    stw 4, -16(1)
-; PWR7-LE-NEXT:    addis 4, 2, .LCPI0_0@toc@ha
 ; PWR7-LE-NEXT:    lfiwzx 0, 0, 3
-; PWR7-LE-NEXT:    addi 3, 1, -16
-; PWR7-LE-NEXT:    addi 4, 4, .LCPI0_0@toc@l
-; PWR7-LE-NEXT:    lxvd2x 1, 0, 4
-; PWR7-LE-NEXT:    xxspltw 35, 0, 1
+; PWR7-LE-NEXT:    addis 3, 2, .LCPI0_0@toc@ha
+; PWR7-LE-NEXT:    xxlxor 36, 36, 36
+; PWR7-LE-NEXT:    addi 3, 3, .LCPI0_0@toc@l
+; PWR7-LE-NEXT:    xxspltw 34, 0, 1
 ; PWR7-LE-NEXT:    lxvd2x 0, 0, 3
-; PWR7-LE-NEXT:    xxswapd 34, 1
-; PWR7-LE-NEXT:    xxswapd 36, 0
-; PWR7-LE-NEXT:    vperm 2, 4, 3, 2
+; PWR7-LE-NEXT:    xxswapd 35, 0
+; PWR7-LE-NEXT:    vperm 2, 4, 2, 3
 ; PWR7-LE-NEXT:    blr
 ;
 ; PWR8-LE-LABEL: build_v2i64_extload_0:
@@ -357,18 +353,14 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
 ;
 ; PWR7-LE-LABEL: build_v4i32_load_0:
 ; PWR7-LE:       # %bb.0: # %entry
-; PWR7-LE-NEXT:    li 4, 0
-; PWR7-LE-NEXT:    stw 4, -16(1)
-; PWR7-LE-NEXT:    addis 4, 2, .LCPI8_0@toc@ha
 ; PWR7-LE-NEXT:    lfiwzx 0, 0, 3
-; PWR7-LE-NEXT:    addi 3, 1, -16
-; PWR7-LE-NEXT:    addi 4, 4, .LCPI8_0@toc@l
-; PWR7-LE-NEXT:    lxvd2x 1, 0, 4
-; PWR7-LE-NEXT:    xxspltw 35, 0, 1
+; PWR7-LE-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
+; PWR7-LE-NEXT:    xxlxor 36, 36, 36
+; PWR7-LE-NEXT:    addi 3, 3, .LCPI8_0@toc@l
+; PWR7-LE-NEXT:    xxspltw 34, 0, 1
 ; PWR7-LE-NEXT:    lxvd2x 0, 0, 
3 -; PWR7-LE-NEXT: xxswapd 34, 1 -; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 4, 3, 2 +; PWR7-LE-NEXT: xxswapd 35, 0 +; PWR7-LE-NEXT: vperm 2, 4, 2, 3 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_0: @@ -412,18 +404,14 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) { ; ; PWR7-LE-LABEL: build_v4i32_load_1: ; PWR7-LE: # %bb.0: # %entry -; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: stw 4, -16(1) -; PWR7-LE-NEXT: addis 4, 2, .LCPI9_0@toc@ha ; PWR7-LE-NEXT: lfiwzx 0, 0, 3 -; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: addi 4, 4, .LCPI9_0@toc@l -; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxspltw 35, 0, 1 +; PWR7-LE-NEXT: addis 3, 2, .LCPI9_0@toc@ha +; PWR7-LE-NEXT: xxlxor 36, 36, 36 +; PWR7-LE-NEXT: addi 3, 3, .LCPI9_0@toc@l +; PWR7-LE-NEXT: xxspltw 34, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 34, 1 -; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 3, 4, 2 +; PWR7-LE-NEXT: xxswapd 35, 0 +; PWR7-LE-NEXT: vperm 2, 2, 4, 3 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_1: @@ -469,18 +457,14 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) { ; ; PWR7-LE-LABEL: build_v4i32_load_2: ; PWR7-LE: # %bb.0: # %entry -; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: stw 4, -16(1) -; PWR7-LE-NEXT: addis 4, 2, .LCPI10_0@toc@ha ; PWR7-LE-NEXT: lfiwzx 0, 0, 3 -; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: addi 4, 4, .LCPI10_0@toc@l -; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxspltw 35, 0, 1 +; PWR7-LE-NEXT: addis 3, 2, .LCPI10_0@toc@ha +; PWR7-LE-NEXT: xxlxor 36, 36, 36 +; PWR7-LE-NEXT: addi 3, 3, .LCPI10_0@toc@l +; PWR7-LE-NEXT: xxspltw 34, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 34, 1 -; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 3, 4, 2 +; PWR7-LE-NEXT: xxswapd 35, 0 +; PWR7-LE-NEXT: vperm 2, 2, 4, 3 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_2: @@ -524,18 +508,14 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) { ; ; PWR7-LE-LABEL: build_v4i32_load_3: ; PWR7-LE: # %bb.0: # %entry -; PWR7-LE-NEXT: li 4, 0 -; PWR7-LE-NEXT: stw 4, -16(1) -; PWR7-LE-NEXT: addis 4, 2, .LCPI11_0@toc@ha ; PWR7-LE-NEXT: lfiwzx 0, 0, 3 -; PWR7-LE-NEXT: addi 3, 1, -16 -; PWR7-LE-NEXT: addi 4, 4, .LCPI11_0@toc@l -; PWR7-LE-NEXT: lxvd2x 1, 0, 4 -; PWR7-LE-NEXT: xxspltw 35, 0, 1 +; PWR7-LE-NEXT: addis 3, 2, .LCPI11_0@toc@ha +; PWR7-LE-NEXT: xxlxor 36, 36, 36 +; PWR7-LE-NEXT: addi 3, 3, .LCPI11_0@toc@l +; PWR7-LE-NEXT: xxspltw 34, 0, 1 ; PWR7-LE-NEXT: lxvd2x 0, 0, 3 -; PWR7-LE-NEXT: xxswapd 34, 1 -; PWR7-LE-NEXT: xxswapd 36, 0 -; PWR7-LE-NEXT: vperm 2, 3, 4, 2 +; PWR7-LE-NEXT: xxswapd 35, 0 +; PWR7-LE-NEXT: vperm 2, 2, 4, 3 ; PWR7-LE-NEXT: blr ; ; PWR8-LE-LABEL: build_v4i32_load_3: diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index e1159e56e23e..7f6fdc7f88cd 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -849,16 +849,12 @@ define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(ptr nocapture rea ; ; P8-AIX-32-LABEL: no_RAUW_in_combine_during_legalize: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: li r5, 0 ; P8-AIX-32-NEXT: slwi r4, r4, 2 -; P8-AIX-32-NEXT: xxlxor v3, v3, v3 -; P8-AIX-32-NEXT: stw r5, -16(r1) +; P8-AIX-32-NEXT: xxlxor v2, v2, v2 ; P8-AIX-32-NEXT: lfiwzx f0, r3, r4 -; P8-AIX-32-NEXT: addi r3, r1, -16 -; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3 ; P8-AIX-32-NEXT: xxspltw vs0, vs0, 
1 -; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0 -; P8-AIX-32-NEXT: vmrghb v2, v2, v3 +; P8-AIX-32-NEXT: xxmrghw v3, v2, vs0 +; P8-AIX-32-NEXT: vmrghb v2, v3, v2 ; P8-AIX-32-NEXT: blr entry: %idx.ext = sext i32 %offset to i64 diff --git a/llvm/test/CodeGen/PowerPC/const-stov.ll b/llvm/test/CodeGen/PowerPC/const-stov.ll new file mode 100644 index 000000000000..69c68a4f2737 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/const-stov.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr -verify-machineinstrs \ +; RUN: -mtriple=powerpc64-- -mcpu=pwr7 < %s | FileCheck \ +; RUN: --check-prefix=PWR7-BE %s +; RUN: llc -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr -verify-machineinstrs \ +; RUN: -mtriple=powerpc64-- -mcpu=pwr8 < %s | FileCheck \ +; RUN: --check-prefix=PWR8-BE %s +; RUN: llc -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr -verify-machineinstrs \ +; RUN: -mtriple=powerpc64le-- -mcpu=pwr8 < %s | FileCheck \ +; RUN: --check-prefix=PWR8-LE %s + +define <16 x i8> @i8(ptr nocapture noundef readonly %p) { +; PWR7-BE-LABEL: i8: +; PWR7-BE: # %bb.0: # %entry +; PWR7-BE-NEXT: lxvw4x v3, 0, r3 +; PWR7-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; PWR7-BE-NEXT: vspltisb v2, 10 +; PWR7-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; PWR7-BE-NEXT: lxvw4x v4, 0, r3 +; PWR7-BE-NEXT: vperm v2, v3, v2, v4 +; PWR7-BE-NEXT: blr +; +; PWR8-BE-LABEL: i8: +; PWR8-BE: # %bb.0: # %entry +; PWR8-BE-NEXT: lxvw4x v2, 0, r3 +; PWR8-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; PWR8-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; PWR8-BE-NEXT: lxvw4x v3, 0, r3 +; PWR8-BE-NEXT: li r3, 10 +; PWR8-BE-NEXT: mtvsrwz v4, r3 +; PWR8-BE-NEXT: vperm v2, v2, v4, v3 +; PWR8-BE-NEXT: blr +; +; PWR8-LE-LABEL: i8: +; PWR8-LE: # %bb.0: # %entry +; PWR8-LE-NEXT: lxvd2x vs0, 0, r3 +; PWR8-LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; PWR8-LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; PWR8-LE-NEXT: xxswapd v2, vs0 +; PWR8-LE-NEXT: lxvd2x vs0, 0, r3 +; PWR8-LE-NEXT: li r3, 10 +; PWR8-LE-NEXT: mtvsrd v4, r3 +; PWR8-LE-NEXT: xxswapd v3, vs0 +; PWR8-LE-NEXT: vperm v2, v4, v2, v3 +; PWR8-LE-NEXT: blr +entry: + %0 = load <16 x i8>, ptr %p, align 16 + %vecinit1 = insertelement <16 x i8> %0, i8 10, i64 1 + ret <16 x i8> %vecinit1 +} + +define <8 x i16> @i16(ptr nocapture noundef readonly %p) { +; PWR7-BE-LABEL: i16: +; PWR7-BE: # %bb.0: # %entry +; PWR7-BE-NEXT: lxvw4x v3, 0, r3 +; PWR7-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; PWR7-BE-NEXT: vspltish v2, 9 +; PWR7-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; PWR7-BE-NEXT: lxvw4x v4, 0, r3 +; PWR7-BE-NEXT: vperm v2, v3, v2, v4 +; PWR7-BE-NEXT: blr +; +; PWR8-BE-LABEL: i16: +; PWR8-BE: # %bb.0: # %entry +; PWR8-BE-NEXT: lxvw4x v2, 0, r3 +; PWR8-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; PWR8-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; PWR8-BE-NEXT: lxvw4x v3, 0, r3 +; PWR8-BE-NEXT: li r3, 9 +; PWR8-BE-NEXT: mtvsrwz v4, r3 +; PWR8-BE-NEXT: vperm v2, v2, v4, v3 +; PWR8-BE-NEXT: blr +; +; PWR8-LE-LABEL: i16: +; PWR8-LE: # %bb.0: # %entry +; PWR8-LE-NEXT: lxvd2x vs0, 0, r3 +; PWR8-LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; PWR8-LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; PWR8-LE-NEXT: xxswapd v2, vs0 +; PWR8-LE-NEXT: lxvd2x vs0, 0, r3 +; PWR8-LE-NEXT: li r3, 9 +; PWR8-LE-NEXT: mtvsrd v4, r3 +; PWR8-LE-NEXT: xxswapd v3, vs0 +; PWR8-LE-NEXT: vperm v2, v4, v2, v3 +; PWR8-LE-NEXT: blr +entry: + %0 = load <8 x i16>, ptr %p, align 16 + %vecinit1 = insertelement <8 x i16> %0, i16 9, i64 1 + ret <8 x i16> %vecinit1 +} + +define <4 x i32> 
@i32(ptr nocapture noundef readonly %p) { +; PWR7-BE-LABEL: i32: +; PWR7-BE: # %bb.0: # %entry +; PWR7-BE-NEXT: lxvw4x v3, 0, r3 +; PWR7-BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; PWR7-BE-NEXT: vspltisw v2, 7 +; PWR7-BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; PWR7-BE-NEXT: lxvw4x v4, 0, r3 +; PWR7-BE-NEXT: vperm v2, v3, v2, v4 +; PWR7-BE-NEXT: blr +; +; PWR8-BE-LABEL: i32: +; PWR8-BE: # %bb.0: # %entry +; PWR8-BE-NEXT: lxvw4x v2, 0, r3 +; PWR8-BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; PWR8-BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; PWR8-BE-NEXT: lxvw4x v3, 0, r3 +; PWR8-BE-NEXT: li r3, 7 +; PWR8-BE-NEXT: mtvsrwz v4, r3 +; PWR8-BE-NEXT: vperm v2, v2, v4, v3 +; PWR8-BE-NEXT: blr +; +; PWR8-LE-LABEL: i32: +; PWR8-LE: # %bb.0: # %entry +; PWR8-LE-NEXT: lxvd2x vs0, 0, r3 +; PWR8-LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; PWR8-LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; PWR8-LE-NEXT: xxswapd v2, vs0 +; PWR8-LE-NEXT: lxvd2x vs0, 0, r3 +; PWR8-LE-NEXT: li r3, 7 +; PWR8-LE-NEXT: mtvsrwz v4, r3 +; PWR8-LE-NEXT: xxswapd v3, vs0 +; PWR8-LE-NEXT: vperm v2, v4, v2, v3 +; PWR8-LE-NEXT: blr +entry: + %0 = load <4 x i32>, ptr %p, align 16 + %vecinit1 = insertelement <4 x i32> %0, i32 7, i64 1 + ret <4 x i32> %vecinit1 +} + +define <2 x i64> @i64(ptr nocapture noundef readonly %p) { +; PWR7-BE-LABEL: i64: +; PWR7-BE: # %bb.0: # %entry +; PWR7-BE-NEXT: lxvd2x v2, 0, r3 +; PWR7-BE-NEXT: li r3, 10 +; PWR7-BE-NEXT: std r3, -16(r1) +; PWR7-BE-NEXT: std r3, -8(r1) +; PWR7-BE-NEXT: addi r3, r1, -16 +; PWR7-BE-NEXT: lxvd2x v3, 0, r3 +; PWR7-BE-NEXT: xxmrghd v2, v2, v3 +; PWR7-BE-NEXT: blr +; +; PWR8-BE-LABEL: i64: +; PWR8-BE: # %bb.0: # %entry +; PWR8-BE-NEXT: lxvd2x v2, 0, r3 +; PWR8-BE-NEXT: li r3, 10 +; PWR8-BE-NEXT: mtfprd f0, r3 +; PWR8-BE-NEXT: xxmrghd v2, v2, vs0 +; PWR8-BE-NEXT: blr +; +; PWR8-LE-LABEL: i64: +; PWR8-LE: # %bb.0: # %entry +; PWR8-LE-NEXT: lxvd2x vs0, 0, r3 +; PWR8-LE-NEXT: li r3, 10 +; PWR8-LE-NEXT: xxswapd v2, vs0 +; PWR8-LE-NEXT: mtfprd f0, r3 +; PWR8-LE-NEXT: xxpermdi v2, vs0, v2, 1 +; PWR8-LE-NEXT: blr +entry: + %0 = load <2 x i64>, ptr %p, align 16 + %vecinit1 = insertelement <2 x i64> %0, i64 10, i64 1 + ret <2 x i64> %vecinit1 +} diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll index c9ee3a51f417..1993b1678b3e 100644 --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -356,11 +356,9 @@ define void @test6(ptr %a, ptr %in) { ; ; P9-AIX32-LABEL: test6: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: li r5, 0 -; P9-AIX32-NEXT: stw r5, -16(r1) ; P9-AIX32-NEXT: lwz r5, L..C2(r2) # %const.0 ; P9-AIX32-NEXT: lxvwsx vs1, 0, r4 -; P9-AIX32-NEXT: lxv vs2, -16(r1) +; P9-AIX32-NEXT: xxlxor vs2, vs2, vs2 ; P9-AIX32-NEXT: lxv vs0, 0(r5) ; P9-AIX32-NEXT: xxperm vs1, vs2, vs0 ; P9-AIX32-NEXT: stxv vs1, 0(r3) @@ -368,13 +366,10 @@ define void @test6(ptr %a, ptr %in) { ; ; P8-AIX32-LABEL: test6: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: li r5, 0 -; P8-AIX32-NEXT: stw r5, -16(r1) ; P8-AIX32-NEXT: lfiwzx f0, 0, r4 ; P8-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 +; P8-AIX32-NEXT: xxlxor v4, v4, v4 ; P8-AIX32-NEXT: lxvw4x v3, 0, r4 -; P8-AIX32-NEXT: addi r4, r1, -16 -; P8-AIX32-NEXT: lxvw4x v4, 0, r4 ; P8-AIX32-NEXT: xxspltw v2, vs0, 1 ; P8-AIX32-NEXT: vperm v2, v4, v2, v3 ; P8-AIX32-NEXT: stxvw4x v2, 0, r3 @@ -382,13 +377,10 @@ define void @test6(ptr %a, ptr %in) { ; ; P7-AIX32-LABEL: test6: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: li r5, 0 -; P7-AIX32-NEXT: stw r5, -16(r1) ; P7-AIX32-NEXT: lfiwzx f0, 0, 
r4 ; P7-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0 +; P7-AIX32-NEXT: xxlxor v4, v4, v4 ; P7-AIX32-NEXT: lxvw4x v3, 0, r4 -; P7-AIX32-NEXT: addi r4, r1, -16 -; P7-AIX32-NEXT: lxvw4x v4, 0, r4 ; P7-AIX32-NEXT: xxspltw v2, vs0, 1 ; P7-AIX32-NEXT: vperm v2, v4, v2, v3 ; P7-AIX32-NEXT: stxvw4x v2, 0, r3 diff --git a/llvm/test/CodeGen/PowerPC/p10-splatImm32-undef.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm32-undef.ll index ad6a576fbf50..04e7110b669a 100644 --- a/llvm/test/CodeGen/PowerPC/p10-splatImm32-undef.ll +++ b/llvm/test/CodeGen/PowerPC/p10-splatImm32-undef.ll @@ -60,15 +60,13 @@ define hidden void @function1() { ; CHECK-LINUX-32: # %bb.0: # %entry ; CHECK-LINUX-32-NEXT: mflr r0 ; CHECK-LINUX-32-NEXT: stw r0, 4(r1) -; CHECK-LINUX-32-NEXT: stwu r1, -48(r1) -; CHECK-LINUX-32-NEXT: .cfi_def_cfa_offset 48 +; CHECK-LINUX-32-NEXT: stwu r1, -32(r1) +; CHECK-LINUX-32-NEXT: .cfi_def_cfa_offset 32 ; CHECK-LINUX-32-NEXT: .cfi_offset lr, 4 ; CHECK-LINUX-32-NEXT: bl call1 -; CHECK-LINUX-32-NEXT: li r4, 0 ; CHECK-LINUX-32-NEXT: stw r3, 16(r1) -; CHECK-LINUX-32-NEXT: stw r4, 32(r1) -; CHECK-LINUX-32-NEXT: lwz r0, 52(r1) -; CHECK-LINUX-32-NEXT: addi r1, r1, 48 +; CHECK-LINUX-32-NEXT: lwz r0, 36(r1) +; CHECK-LINUX-32-NEXT: addi r1, r1, 32 ; CHECK-LINUX-32-NEXT: mtlr r0 ; CHECK-LINUX-32-NEXT: blr ; @@ -76,13 +74,11 @@ define hidden void @function1() { ; CHECK-AIX-32: # %bb.0: # %entry ; CHECK-AIX-32-NEXT: mflr r0 ; CHECK-AIX-32-NEXT: stw r0, 8(r1) -; CHECK-AIX-32-NEXT: stwu r1, -96(r1) +; CHECK-AIX-32-NEXT: stwu r1, -80(r1) ; CHECK-AIX-32-NEXT: bl .call1[PR] ; CHECK-AIX-32-NEXT: nop -; CHECK-AIX-32-NEXT: li r4, 0 ; CHECK-AIX-32-NEXT: stw r3, 64(r1) -; CHECK-AIX-32-NEXT: stw r4, 80(r1) -; CHECK-AIX-32-NEXT: addi r1, r1, 96 +; CHECK-AIX-32-NEXT: addi r1, r1, 80 ; CHECK-AIX-32-NEXT: lwz r0, 8(r1) ; CHECK-AIX-32-NEXT: mtlr r0 ; CHECK-AIX-32-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll index 0171e27e8090..35b478017383 100644 --- a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll @@ -8,30 +8,27 @@ define dso_local fastcc void @BuildVectorICE() unnamed_addr { ; 32BIT-LABEL: BuildVectorICE: ; 32BIT: # %bb.0: # %entry -; 32BIT-NEXT: stwu 1, -64(1) -; 32BIT-NEXT: .cfi_def_cfa_offset 64 -; 32BIT-NEXT: li 4, .LCPI0_0@l -; 32BIT-NEXT: lis 5, .LCPI0_0@ha +; 32BIT-NEXT: stwu 1, -48(1) +; 32BIT-NEXT: .cfi_def_cfa_offset 48 ; 32BIT-NEXT: lxvw4x 34, 0, 3 -; 32BIT-NEXT: li 3, 0 -; 32BIT-NEXT: addi 6, 1, 48 -; 32BIT-NEXT: li 7, 0 -; 32BIT-NEXT: lxvw4x 35, 5, 4 +; 32BIT-NEXT: li 3, .LCPI0_0@l +; 32BIT-NEXT: lis 4, .LCPI0_0@ha +; 32BIT-NEXT: li 5, 0 +; 32BIT-NEXT: xxlxor 36, 36, 36 +; 32BIT-NEXT: lxvw4x 35, 4, 3 +; 32BIT-NEXT: addi 3, 1, 16 ; 32BIT-NEXT: addi 4, 1, 32 -; 32BIT-NEXT: addi 5, 1, 16 ; 32BIT-NEXT: .p2align 4 ; 32BIT-NEXT: .LBB0_1: # %while.body ; 32BIT-NEXT: # -; 32BIT-NEXT: stw 3, 32(1) -; 32BIT-NEXT: stw 7, 16(1) -; 32BIT-NEXT: lxvw4x 36, 0, 4 -; 32BIT-NEXT: lxvw4x 37, 0, 5 -; 32BIT-NEXT: vperm 4, 5, 4, 3 -; 32BIT-NEXT: vadduwm 4, 2, 4 -; 32BIT-NEXT: xxspltw 37, 36, 1 -; 32BIT-NEXT: vadduwm 4, 4, 5 -; 32BIT-NEXT: stxvw4x 36, 0, 6 -; 32BIT-NEXT: lwz 7, 48(1) +; 32BIT-NEXT: stw 5, 16(1) +; 32BIT-NEXT: lxvw4x 37, 0, 3 +; 32BIT-NEXT: vperm 5, 5, 4, 3 +; 32BIT-NEXT: vadduwm 5, 2, 5 +; 32BIT-NEXT: xxspltw 32, 37, 1 +; 32BIT-NEXT: vadduwm 5, 5, 0 +; 32BIT-NEXT: stxvw4x 37, 0, 4 +; 32BIT-NEXT: lwz 5, 32(1) ; 32BIT-NEXT: b .LBB0_1 ; ; 64BIT-LABEL: BuildVectorICE: -- GitLab From 
f1e455ed51be4f53462db87aa5d64dbd830e5de2 Mon Sep 17 00:00:00 2001
From: Jinsong Ji
Date: Mon, 21 Oct 2024 12:21:50 -0400
Subject: [PATCH 255/511] [NFC][Sema][OpenMP] Fix free-nonheap-object warning
 (#112942)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is one of the many PRs to fix errors with LLVM_ENABLE_WERROR=on,
built with GCC 11. It fixes the following warning:

In destructor 'llvm::APInt::~APInt()',
  inlined from 'llvm::APInt::~APInt()' at llvm-project/llvm/include/llvm/ADT/APInt.h:190:3,
  inlined from 'llvm::APSInt::~APSInt()' at llvm-project/llvm/include/llvm/ADT/APSInt.h:23:21,
  inlined from 'bool checkOMPArraySectionConstantForReduction(clang::ASTContext&, const clang::ArraySectionExpr*, bool&, llvm::SmallVectorImpl<llvm::APSInt>&)' at llvm-project/clang/lib/Sema/SemaOpenMP.cpp:18357:45,
  inlined from 'bool actOnOMPReductionKindClause(clang::Sema&, {anonymous}::DSAStackTy*, clang::OpenMPClauseKind, llvm::ArrayRef<clang::Expr*>, clang::SourceLocation, clang::SourceLocation, clang::SourceLocation, clang::SourceLocation, clang::CXXScopeSpec&, const clang::DeclarationNameInfo&, llvm::ArrayRef<clang::Expr*>, {anonymous}::ReductionData&)' at llvm-project/clang/lib/Sema/SemaOpenMP.cpp:18715:68:
llvm-project/llvm/include/llvm/ADT/APInt.h:192:18: error: 'void operator delete [](void*)' called on a pointer to an unallocated object '1' [-Werror=free-nonheap-object]
  192 |     delete[] U.pVal;
      |              ^~~~
---
 clang/lib/Sema/SemaOpenMP.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 0232745b3c19..fa81fc42c0ee 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -18335,7 +18335,8 @@ static bool checkOMPArraySectionConstantForReduction(
       return false;
 
     // This is an array subscript which has implicit length 1!
-    ArraySizes.push_back(llvm::APSInt::get(1));
+    llvm::APSInt ConstantOne = llvm::APSInt::get(1);
+    ArraySizes.push_back(ConstantOne);
   } else {
     Expr::EvalResult Result;
     if (!Length->EvaluateAsInt(Result, Context))
@@ -18354,7 +18355,8 @@ static bool checkOMPArraySectionConstantForReduction(
   if (!SingleElement) {
     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) {
       // Has implicit length 1!
-      ArraySizes.push_back(llvm::APSInt::get(1));
+      llvm::APSInt ConstantOne = llvm::APSInt::get(1);
+      ArraySizes.push_back(ConstantOne);
       Base = TempASE->getBase()->IgnoreParenImpCasts();
     }
   }
-- 
GitLab

From d4630ae5ed678e50f4758d0fb7a6875494f690e5 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Mon, 21 Oct 2024 09:22:10 -0700
Subject: [PATCH 256/511] [Vectorize] Fix a warning

This patch fixes:

  llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h:85:16:
  error: private field 'Reason' is not used
  [-Werror,-Wunused-private-field]
---
 .../llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index 233abf3efd64..8f698f44c7c0 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -82,7 +82,7 @@ public:
 
 /// Base class for results with reason.
 class LegalityResultWithReason : public LegalityResult {
-  ResultReason Reason;
+  [[maybe_unused]] ResultReason Reason;
   LegalityResultWithReason(LegalityResultID ID, ResultReason Reason)
      : LegalityResult(ID), Reason(Reason) {}
   friend class Pack; // For constructor.
-- 
GitLab

From 006fb0904d8e549476342de4b749792f73b3af85 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Mon, 21 Oct 2024 16:26:55 +0000
Subject: [PATCH 257/511] [gn build] Port 54c93aabec96

---
 llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
index d54b12e3a20d..ea0f9b872308 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
@@ -15,6 +15,7 @@ static_library("Vectorize") {
     "SLPVectorizer.cpp",
     "SandboxVectorizer/DependencyGraph.cpp",
     "SandboxVectorizer/Interval.cpp",
+    "SandboxVectorizer/Legality.cpp",
    "SandboxVectorizer/Passes/BottomUpVec.cpp",
     "SandboxVectorizer/Passes/RegionsFromMetadata.cpp",
     "SandboxVectorizer/SandboxVectorizer.cpp",
-- 
GitLab

From 3277c7cd28154e33637a168acb26cea7ac1f7fff Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Mon, 21 Oct 2024 09:39:52 -0700
Subject: [PATCH 258/511] [AMDGPU] Skip VGPR deallocation for waveslot limited
 kernels (#112765)

MSG_DEALLOC_VGPRS slows down very small waveslot-limited kernels. It has
been identified that this message is only really needed for VGPR-limited
kernels. A kernel becomes VGPR limited if the total number of VGPRs per
SIMD divided by the number of used VGPRs is less than the number of wave
slots.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  25 +-
 .../CodeGen/AMDGPU/GlobalISel/addsubu64.ll    |  12 -
 .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll  |  28 -
 .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll  |  30 -
 .../AMDGPU/GlobalISel/extractelement.ll       |  16 -
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll |  24 -
 .../AMDGPU/GlobalISel/insertelement.i16.ll    |  62 --
 .../AMDGPU/GlobalISel/insertelement.i8.ll     |  62 --
 .../AMDGPU/GlobalISel/insertelement.large.ll  |   2 -
 .../AMDGPU/GlobalISel/insertelement.ll        |  18 -
 .../GlobalISel/llvm.amdgcn.ballot.i32.ll      |   2 -
 .../GlobalISel/llvm.amdgcn.div.scale.ll       |  48 --
 .../GlobalISel/llvm.amdgcn.end.cf.i32.ll      |   2 -
 .../llvm.amdgcn.global.atomic.csub.ll         |   4 -
 .../GlobalISel/llvm.amdgcn.if.break.i32.ll    |   2 -
 .../GlobalISel/llvm.amdgcn.image.store.2d.ll  |  44 -
 .../AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll  |   4 -
 .../GlobalISel/llvm.amdgcn.update.dpp.ll      |  12 -
 .../AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll  |  52 --
 .../AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll  |  52 --
 .../AMDGPU/GlobalISel/load-unaligned.ll       |   2 -
 .../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll |  32 -
 .../AMDGPU/GlobalISel/mul-known-bits.i64.ll   |  22 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |  16 -
 .../AMDGPU/GlobalISel/shl-ext-reduce.ll       |   4 -
 ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll |  56 --
 .../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll   |  44 -
 .../GlobalISel/wmma-gfx12-w32-iu-modifiers.ll |  36 -
 .../wmma-gfx12-w32-swmmac-index_key.ll        |  20 -
 .../AMDGPU/GlobalISel/wmma-gfx12-w32.ll       |  44 -
 ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll |  56 --
 .../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll   |  44 -
 .../GlobalISel/wmma-gfx12-w64-iu-modifiers.ll |  36 -
 .../wmma-gfx12-w64-swmmac-index_key.ll        |  22 -
 .../AMDGPU/GlobalISel/wmma-gfx12-w64.ll       |  44 -
 llvm/test/CodeGen/AMDGPU/add.ll               |  40 -
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |  26 -
 .../AMDGPU/atomic_optimizations_buffer.ll     |  72 --
 .../atomic_optimizations_global_pointer.ll    | 128 ---
 .../atomic_optimizations_local_pointer.ll     | 192 -----
 .../atomic_optimizations_pixelshader.ll       |   8 -
 .../AMDGPU/atomic_optimizations_raw_buffer.ll |  64 --
.../atomic_optimizations_struct_buffer.ll | 80 -- llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 8 - llvm/test/CodeGen/AMDGPU/bitreverse.ll | 40 - llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 8 - llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 78 +- llvm/test/CodeGen/AMDGPU/bswap.ll | 14 - llvm/test/CodeGen/AMDGPU/build_vector.ll | 10 - .../CodeGen/AMDGPU/calling-conventions.ll | 62 -- .../test/CodeGen/AMDGPU/carryout-selection.ll | 34 - llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 2 - llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 40 - llvm/test/CodeGen/AMDGPU/clamp.ll | 196 ----- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 6 - .../AMDGPU/commute-compares-scalar-float.ll | 64 -- llvm/test/CodeGen/AMDGPU/ctlz.ll | 32 - llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 38 - .../AMDGPU/divergence-driven-buildvector.ll | 12 - llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 6 - .../expand-scalar-carry-out-select-user.ll | 2 - .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 22 - llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 22 - llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 48 -- .../fast-unaligned-load-store.global.ll | 12 - llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 94 --- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 220 ----- llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 58 -- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 44 - llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 38 - llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 32 - llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 34 - llvm/test/CodeGen/AMDGPU/fdiv.ll | 36 - .../AMDGPU/fix-sgpr-copies-nondeterminism.ll | 2 - llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 24 - llvm/test/CodeGen/AMDGPU/fma-combine.ll | 90 --- llvm/test/CodeGen/AMDGPU/fmax3.ll | 8 - llvm/test/CodeGen/AMDGPU/fmaximum.ll | 4 - llvm/test/CodeGen/AMDGPU/fmed3.ll | 130 --- llvm/test/CodeGen/AMDGPU/fmin3.ll | 12 - llvm/test/CodeGen/AMDGPU/fminimum.ll | 4 - .../AMDGPU/fmul-2-combine-multi-use.ll | 32 - llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 16 - llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 98 --- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 14 - llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 22 - .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 2 - llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 22 - llvm/test/CodeGen/AMDGPU/fneg.ll | 28 - .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 20 - llvm/test/CodeGen/AMDGPU/fp-classify.ll | 30 - .../AMDGPU/fp-min-max-buffer-atomics.ll | 30 - .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll | 20 - .../AMDGPU/fp-min-max-num-global-atomics.ll | 8 - llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll | 4 - llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll | 4 - llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll | 4 - llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 56 -- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 28 - llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 28 - llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 40 - llvm/test/CodeGen/AMDGPU/fptrunc.ll | 28 - llvm/test/CodeGen/AMDGPU/frem.ll | 56 -- llvm/test/CodeGen/AMDGPU/fshl.ll | 14 - llvm/test/CodeGen/AMDGPU/fshr.ll | 12 - llvm/test/CodeGen/AMDGPU/fsub.f16.ll | 12 - .../AMDGPU/gfx12_scalar_subword_loads.ll | 92 --- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 4 - .../CodeGen/AMDGPU/global-saddr-atomics.ll | 16 - .../test/CodeGen/AMDGPU/global-saddr-store.ll | 244 ------ .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 114 --- .../AMDGPU/global_atomics_scan_fadd.ll | 24 - .../AMDGPU/global_atomics_scan_fmax.ll | 40 - .../AMDGPU/global_atomics_scan_fmin.ll | 40 - llvm/test/CodeGen/AMDGPU/half.ll | 88 -- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 16 - llvm/test/CodeGen/AMDGPU/idot4s.ll | 34 - 
llvm/test/CodeGen/AMDGPU/idot4u.ll | 62 -- .../test/CodeGen/AMDGPU/image-load-d16-tfe.ll | 14 - llvm/test/CodeGen/AMDGPU/imm16.ll | 66 -- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 70 -- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 2 - .../AMDGPU/llvm.amdgcn.bitreplicate.ll | 2 - .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll | 10 - .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 18 - .../AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll | 4 - .../AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll | 4 - .../AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll | 4 - .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 188 ----- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 96 --- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 4 - .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 2 - .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 4 - ...vm.amdgcn.global.atomic.ordered.add.b64.ll | 4 - .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 8 - .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 8 - .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 128 --- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 68 -- .../AMDGPU/llvm.amdgcn.image.a16.dim.ll | 326 ++------ .../AMDGPU/llvm.amdgcn.image.a16.encode.ll | 76 -- .../CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll | 349 ++------ .../AMDGPU/llvm.amdgcn.image.store.a16.d16.ll | 48 -- .../AMDGPU/llvm.amdgcn.image.store.a16.ll | 48 -- .../AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll | 16 - .../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll | 28 - .../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 2 - .../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 2 - .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 752 ------------------ .../AMDGPU/llvm.amdgcn.permlane16.var.ll | 104 --- .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 6 - .../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 16 - .../CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll | 4 - .../AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll | 40 - .../AMDGPU/llvm.amdgcn.raw.buffer.store.ll | 48 -- .../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll | 8 - .../llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll | 8 - .../llvm.amdgcn.raw.ptr.tbuffer.store.ll | 22 - .../llvm.amdgcn.raw.tbuffer.store.d16.ll | 18 - .../AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll | 44 - .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 32 - .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 32 - .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 6 - .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 96 --- .../AMDGPU/llvm.amdgcn.s.buffer.load.ll | 4 - .../CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll | 24 - .../llvm.amdgcn.struct.buffer.load.tfe.ll | 40 - .../AMDGPU/llvm.amdgcn.struct.buffer.store.ll | 34 - ...lvm.amdgcn.struct.ptr.tbuffer.store.d16.ll | 8 - .../llvm.amdgcn.struct.ptr.tbuffer.store.ll | 28 - .../llvm.amdgcn.struct.tbuffer.store.d16.ll | 18 - .../llvm.amdgcn.struct.tbuffer.store.ll | 56 -- .../CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll | 2 - .../CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll | 52 -- .../CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll | 52 -- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll | 4 - .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 76 -- llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll | 8 - llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 4 - llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll | 8 - llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 16 - llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll | 2 - .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 2 - .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 2 - llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll | 4 - llvm/test/CodeGen/AMDGPU/llvm.log.ll | 16 - llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 16 - llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 16 - 
llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 18 - llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 18 - llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 8 - llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 4 - llvm/test/CodeGen/AMDGPU/llvm.round.ll | 12 - llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 2 - llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 4 - llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll | 4 - llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 4 - .../AMDGPU/load-constant-always-uniform.ll | 4 - llvm/test/CodeGen/AMDGPU/load-constant-f32.ll | 2 - llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 4 - llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 88 -- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 78 -- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 50 -- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 12 - llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 104 --- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 8 - .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 4 - .../lower-work-group-id-intrinsics-hsa.ll | 4 - .../lower-work-group-id-intrinsics-pal.ll | 70 +- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 16 - llvm/test/CodeGen/AMDGPU/mad.u16.ll | 2 - llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 4 - llvm/test/CodeGen/AMDGPU/madak.ll | 40 - .../match-perm-extract-vector-elt-bug.ll | 2 - .../CodeGen/AMDGPU/max-hard-clause-length.ll | 4 - llvm/test/CodeGen/AMDGPU/min.ll | 70 -- llvm/test/CodeGen/AMDGPU/minimummaximum.ll | 8 - llvm/test/CodeGen/AMDGPU/minmax.ll | 16 - llvm/test/CodeGen/AMDGPU/mul.ll | 76 -- .../CodeGen/AMDGPU/offset-split-global.ll | 142 ---- llvm/test/CodeGen/AMDGPU/omod.ll | 90 --- .../CodeGen/AMDGPU/release-vgprs-dbg-loc.mir | 4 +- llvm/test/CodeGen/AMDGPU/release-vgprs.mir | 136 ++-- llvm/test/CodeGen/AMDGPU/rotl.ll | 6 - llvm/test/CodeGen/AMDGPU/rotr.ll | 6 - llvm/test/CodeGen/AMDGPU/saddo.ll | 12 - llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll | 112 --- llvm/test/CodeGen/AMDGPU/select.f16.ll | 20 - llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 16 - .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 74 -- llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 16 - llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 20 - llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 6 - llvm/test/CodeGen/AMDGPU/sub.ll | 26 - llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 26 - llvm/test/CodeGen/AMDGPU/trap-abis.ll | 6 - llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 16 - llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 20 - llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 46 -- llvm/test/CodeGen/AMDGPU/v_madak_f16.ll | 4 - llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 8 - .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 6 - .../AMDGPU/vgpr-mark-last-scratch-load.ll | 4 - .../wait-before-stores-with-scope_sys.ll | 4 - llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 24 - ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 56 -- .../test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll | 44 - .../AMDGPU/wmma-gfx12-w32-iu-modifiers.ll | 36 - .../AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll | 20 - llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll | 44 - ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 56 -- .../test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll | 44 - .../AMDGPU/wmma-gfx12-w64-iu-modifiers.ll | 36 - .../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll | 22 - llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll | 44 - llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll | 44 - llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll | 44 - .../AMDGPU/workgroup-id-in-arch-sgprs.ll | 6 - 257 files changed, 335 insertions(+), 9655 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
index 8f1757db8a85..1e9a61d103a2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2606,15 +2606,24 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM // instructions. - for (MachineInstr *MI : ReleaseVGPRInsts) { - if (ST->requiresNopBeforeDeallocVGPRs()) { - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP)) - .addImm(0); + // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short + // waveslot limited kernel runs slower with the deallocation. + if (!ReleaseVGPRInsts.empty() && + (MF.getFrameInfo().hasCalls() || + ST->getOccupancyWithNumVGPRs( + TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) < + AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { + for (MachineInstr *MI : ReleaseVGPRInsts) { + if (ST->requiresNopBeforeDeallocVGPRs()) { + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_NOP)) + .addImm(0); + } + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_SENDMSG)) + .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); + Modified = true; } - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_SENDMSG)) - .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); - Modified = true; } ReleaseVGPRInsts.clear(); PreheadersToFlush.clear(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll index 359c1e53de99..ad3c588f5755 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll @@ -15,8 +15,6 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_add_u64: @@ -30,8 +28,6 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %add = add i64 %a, %b @@ -45,8 +41,6 @@ define amdgpu_ps void @v_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm entry: %add = add i64 %a, %b @@ -67,8 +61,6 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_u64: @@ -82,8 +74,6 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %sub = sub i64 
%a, %b @@ -97,8 +87,6 @@ define amdgpu_ps void @v_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4 ; GCN-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm entry: %sub = sub i64 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index 705bcbddf227..e28a1efb7540 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -84,8 +84,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -163,8 +161,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4 @@ -353,8 +349,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -431,8 +425,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 @@ -510,8 +502,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -797,8 +787,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id @@ -2302,8 +2290,6 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] -; 
GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 @@ -2390,8 +2376,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -2474,8 +2458,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8 @@ -2679,8 +2661,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -2762,8 +2742,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8 @@ -2846,8 +2824,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 @@ -3153,8 +3129,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id @@ -3334,8 +3308,6 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index b3a7e65f771c..d63044d7cec6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -84,8 +84,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -163,8 +161,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4 @@ -353,8 +349,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -431,8 +425,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 @@ -510,8 +502,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_system(ptr addrspace ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -797,8 +787,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id @@ -967,8 +955,6 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 @@ -1055,8 +1041,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr
addrspace(1) %out, align 4 @@ -1139,8 +1123,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8 @@ -1344,8 +1326,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -1427,8 +1407,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8 @@ -1511,8 +1489,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 @@ -1818,8 +1794,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id @@ -2680,8 +2654,6 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 @@ -3541,8 +3513,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 %result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 34efb089b72b..ca6e5df43a04 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -480,8 +480,6 @@ define amdgpu_ps void @dyn_extract_v8i64_const_s_s(i32 inreg %sel) { 
; GFX11-NEXT: s_movrels_b64 s[0:1], s[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> , i32 %sel @@ -627,8 +625,6 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) { ; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s16, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s17, vcc_lo ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel @@ -745,8 +741,6 @@ define amdgpu_ps void @dyn_extract_v8i64_v_s(<8 x i64> %vec, i32 inreg %sel) { ; GFX11-NEXT: v_movrels_b32_e32 v16, v0 ; GFX11-NEXT: v_movrels_b32_e32 v17, v1 ; GFX11-NEXT: global_store_b64 v[0:1], v[16:17], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel @@ -852,8 +846,6 @@ define amdgpu_ps void @dyn_extract_v8i64_s_s(<8 x i64> inreg %vec, i32 inreg %se ; GFX11-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel @@ -1805,8 +1797,6 @@ define amdgpu_ps void @dyn_extract_v8p1_s_s(<8 x ptr addrspace(1)> inreg %vec, i ; GFX11-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x ptr addrspace(1)> %vec, i32 %idx @@ -3379,8 +3369,6 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ext = extractelement <5 x double> , i32 %sel @@ -4369,8 +4357,6 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: s_cselect_b32 s2, 4.0, s2 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ext = extractelement <4 x float> , i32 %sel @@ -4727,8 +4713,6 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ext = extractelement <4 x double> , i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 6e2e88f22600..097640312322 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -3699,8 +3699,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; GFX11-NEXT: scratch_load_b32 v2, off, s0 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: sgpr_base_large_offset: @@ -3708,8 +3706,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; UNALIGNED_GFX9-LABEL: sgpr_base_large_offset: @@ -3748,8 +3744,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v2, off, s0 ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: global_store_b32 v[0:1], v2, off -; UNALIGNED_GFX11-NEXT: s_nop 0 -; UNALIGNED_GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; UNALIGNED_GFX11-NEXT: s_endpgm ; ; UNALIGNED_GFX12-LABEL: sgpr_base_large_offset: @@ -3757,8 +3751,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: global_store_b32 v[0:1], v2, off -; UNALIGNED_GFX12-NEXT: s_nop 0 -; UNALIGNED_GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; UNALIGNED_GFX12-NEXT: s_endpgm entry: %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512 @@ -3809,8 +3801,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: sgpr_base_large_offset_split: @@ -3821,8 +3811,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; UNALIGNED_GFX9-LABEL: sgpr_base_large_offset_split: @@ -3866,8 +3854,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: global_store_b32 v[0:1], v2, off -; UNALIGNED_GFX11-NEXT: s_nop 0 -; UNALIGNED_GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; UNALIGNED_GFX11-NEXT: s_endpgm ; ; UNALIGNED_GFX12-LABEL: sgpr_base_large_offset_split: @@ -3878,8 +3864,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: global_store_b32 v[0:1], v2, off -; UNALIGNED_GFX12-NEXT: s_nop 0 -; UNALIGNED_GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; UNALIGNED_GFX12-NEXT: s_endpgm entry: ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5) @@ -4041,8 +4025,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; 
GFX12-LABEL: sgpr_base_negative_offset: @@ -4050,8 +4032,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; UNALIGNED_GFX9-LABEL: sgpr_base_negative_offset: @@ -4088,8 +4068,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24 ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: global_store_b32 v[0:1], v2, off -; UNALIGNED_GFX11-NEXT: s_nop 0 -; UNALIGNED_GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; UNALIGNED_GFX11-NEXT: s_endpgm ; ; UNALIGNED_GFX12-LABEL: sgpr_base_negative_offset: @@ -4097,8 +4075,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: global_store_b32 v[0:1], v2, off -; UNALIGNED_GFX12-NEXT: s_nop 0 -; UNALIGNED_GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; UNALIGNED_GFX12-NEXT: s_endpgm entry: %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index 676a02aaf8a1..4ae98ff1edf6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -90,8 +90,6 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -184,8 +182,6 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(1 ) %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -272,8 +268,6 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_lshl_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -367,8 +361,6 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, s0, v3, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -459,8 +451,6 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, 
s0, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -556,8 +546,6 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(1) %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -649,8 +637,6 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(1) %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -743,8 +729,6 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(1) %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -925,8 +909,6 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(1 ) %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx @@ -1058,8 +1040,6 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(4) %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx @@ -1200,8 +1180,6 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(4) %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx @@ -1339,8 +1317,6 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(4) %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx @@ -1467,8 +1443,6 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 
v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(1) %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx @@ -1590,8 +1564,6 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(1) %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx @@ -1716,8 +1688,6 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(1) %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx @@ -1900,8 +1870,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, s3 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx @@ -2057,8 +2025,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(1 ) %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx @@ -2240,8 +2206,6 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx @@ -2428,8 +2392,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx @@ -2616,8 +2578,6 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx @@ -2774,8 +2734,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX11-NEXT: 
v_cndmask_b32_e64 v2, v5, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(1) %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx @@ -2928,8 +2886,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(1) %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx @@ -3083,8 +3039,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(1) %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx @@ -3267,8 +3221,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(4) %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -3423,8 +3375,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: global_store_b128 v[10:11], v[6:9], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1 ) %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -3605,8 +3555,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(4) %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -3900,8 +3848,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(4) %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -4195,8 +4141,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(4) %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -4443,8 +4387,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX11-NEXT: 
s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[11:12], v[0:3], off ; GFX11-NEXT: global_store_b128 v[13:14], v[4:7], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1) %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -4596,8 +4538,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off ; GFX11-NEXT: global_store_b128 v[11:12], v[7:10], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1) %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -4841,8 +4781,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[12:13], v[0:3], off ; GFX11-NEXT: global_store_b128 v[14:15], v[4:7], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1) %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index d5bfb7faf7fc..d4b9bc6d2e3c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -105,8 +105,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(4) %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -209,8 +207,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(1 ) %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -314,8 +310,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(4) %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -423,8 +417,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(4) %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -528,8 +520,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr 
addrspace(4) %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -632,8 +622,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(1) %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -733,8 +721,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(1) %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -834,8 +820,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(1) %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -985,8 +969,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(1 ) %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx @@ -1073,8 +1055,6 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_lshl_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(4) %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx @@ -1168,8 +1148,6 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, s0, v3, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(4) %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx @@ -1260,8 +1238,6 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(4) %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx @@ -1357,8 +1333,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(1) %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx @@ -1450,8 +1424,6 @@ define amdgpu_ps void 
@insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(1) %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx @@ -1544,8 +1516,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(1) %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx @@ -1679,8 +1649,6 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -1804,8 +1772,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(1 ) %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -1937,8 +1903,6 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -2079,8 +2043,6 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -2218,8 +2180,6 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -2346,8 +2306,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(1) %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -2469,8 +2427,6 
@@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(1) %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -2595,8 +2551,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(1) %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx @@ -2779,8 +2733,6 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, s3 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx @@ -2936,8 +2888,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(1 ) %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx @@ -3119,8 +3069,6 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx @@ -3307,8 +3255,6 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx @@ -3495,8 +3441,6 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx @@ -3653,8 +3597,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(1) %ptr %insert = insertelement 
<16 x i8> %vec, i8 %val, i32 %idx @@ -3807,8 +3749,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(1) %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx @@ -3962,8 +3902,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(1) %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 5185f6c4ada5..8ff1e1d8d072 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -148,8 +148,6 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:240 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, ptr addrspace(1) %ptr.in, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 49d9ad1c0f79..df1afdf77983 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -1078,8 +1078,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[13:16], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1228,8 +1226,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1290,8 +1286,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1497,8 +1491,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[15:18], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: 
s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1622,8 +1614,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1684,8 +1674,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double % ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1803,8 +1791,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -2411,8 +2397,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %idx.add = add i32 %idx, 1 @@ -2537,8 +2521,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %idx.add = add i32 %idx, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 2edcf23df411..927a31d3992b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -478,8 +478,6 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid ; GFX11-NEXT: s_and_b32 s0, s0, exec_lo ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %cmp = icmp eq i32 %cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index f88125ea0293..19ccb476a0a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -74,8 +74,6 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -159,8 +157,6 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr 
addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -250,8 +246,6 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -341,8 +335,6 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -421,8 +413,6 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -499,8 +489,6 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -577,8 +565,6 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -655,8 +641,6 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, s0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -734,8 +718,6 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -813,8 +795,6 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -892,8 +872,6 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -971,8 +949,6 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -1035,8 +1011,6 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s4 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -1094,8 +1068,6 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s5, s4 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) %result0 = extractvalue { float, i1 } %result, 0 @@ -1157,8 +1129,6 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) %result0 = extractvalue { double, i1 } %result, 0 @@ -1220,8 +1190,6 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) %result0 = extractvalue { double, i1 } %result, 0 @@ -1287,8 +1255,6 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -1358,8 +1324,6 @@ 
define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -1445,8 +1409,6 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -1537,8 +1499,6 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -1594,8 +1554,6 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -1642,8 +1600,6 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -1688,8 +1644,6 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -1738,8 +1692,6 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false) %result0 = extractvalue { double, i1 } %result, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll index 6415e185446f..287546750d87 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -40,8 +40,6 @@ define amdgpu_kernel 
void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index 59818b0b1bc3..4a9594ad45e1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -154,8 +154,6 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: @@ -166,8 +164,6 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll index 81c73c789239..572894af8851 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -33,8 +33,6 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll index 9e445d034edc..c1c383eb583a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll @@ -56,8 +56,6 @@ define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, fl ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_f32: @@ -71,8 +69,6 @@ define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, fl ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v2, [v0, v1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.f32.i32(float %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -129,8 +125,6 @@ define amdgpu_ps void @image_store_v2f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: 
s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v2f32: @@ -144,8 +138,6 @@ define amdgpu_ps void @image_store_v2f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:3], [v0, v1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v2f32.i32(<2 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -202,8 +194,6 @@ define amdgpu_ps void @image_store_v3f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v3f32: @@ -217,8 +207,6 @@ define amdgpu_ps void @image_store_v3f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:4], [v0, v1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v3f32.i32(<3 x float> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -275,8 +263,6 @@ define amdgpu_ps void @image_store_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v4f32: @@ -290,8 +276,6 @@ define amdgpu_ps void @image_store_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -348,8 +332,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0001(<8 x i32> inreg %rsrc, i32 % ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v4f32_dmask_0001: @@ -363,8 +345,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0001(<8 x i32> inreg %rsrc, i32 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -421,8 +401,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0010(<8 x i32> inreg %rsrc, i32 % ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v4f32_dmask_0010: @@ -436,8 +414,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0010(<8 x i32> inreg %rsrc, i32 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 2, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -494,8 +470,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0100(<8 x i32> inreg %rsrc, i32 % ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v4f32_dmask_0100: @@ -509,8 +483,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0100(<8 x i32> inreg %rsrc, i32 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 4, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -567,8 +539,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 % ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v4f32_dmask_1000: @@ -582,8 +552,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 8, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -640,8 +608,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0011(<8 x i32> inreg %rsrc, i32 % ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v4f32_dmask_0011: @@ -655,8 +621,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0011(<8 x i32> inreg %rsrc, i32 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -713,8 +677,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 % ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_v4f32_dmask_0110: @@ 
-728,8 +690,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 % ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 6, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -793,8 +753,6 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_f32_dmask_1111: @@ -809,8 +767,6 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: image_store v0, [v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm tail call void @llvm.amdgcn.image.store.2d.f32.i32(float %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index 4946d3759c2e..3b402f919f34 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -40,8 +40,6 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01] ; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0 store i32 %tmp0, ptr addrspace(1) %out @@ -82,8 +80,6 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11] ; GFX11-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] %tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0 store i64 %tmp0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 1e8209bd3fc6..4b0f2ef77a98 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -36,8 +36,6 @@ define amdgpu_kernel void 
@dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) store i32 %tmp0, ptr addrspace(1) %out @@ -88,8 +86,6 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id @@ -144,8 +140,6 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id @@ -200,8 +194,6 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id @@ -256,8 +248,6 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id @@ -312,8 +302,6 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll index cc0e34be02a7..603eb88c07af 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll @@ -19,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B ; W32-NEXT: s_clause 0x1 ; W32-NEXT: 
global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) @@ -37,8 +35,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) @@ -55,8 +51,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0) @@ -71,8 +65,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1) @@ -91,8 +83,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0) @@ -118,8 +108,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0) @@ -138,8 +126,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0) @@ -154,8 +140,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1) @@ -174,8 +158,6 @@ define amdgpu_ps void 
@test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0) @@ -201,8 +183,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0) @@ -221,8 +201,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -237,8 +215,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -253,8 +229,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -269,8 +243,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -285,8 +257,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -301,8 +271,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, 
<4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -317,8 +285,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -333,8 +299,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -351,8 +315,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -367,8 +329,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -383,8 +343,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -399,8 +357,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -416,8 +372,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -432,8 +386,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -448,8 +400,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -464,8 +414,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll index 112a7d98ce7e..7deaca4ca78b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll @@ -17,8 +17,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C) @@ -33,8 +31,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C) @@ -49,8 +45,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0) @@ -63,8 +57,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1) @@ -79,8 +71,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35] ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: 
%res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0) @@ -102,8 +92,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43] ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0) @@ -120,8 +108,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0) @@ -134,8 +120,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1) @@ -150,8 +134,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35] ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0) @@ -173,8 +155,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43] ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0) @@ -191,8 +171,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -206,8 +184,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -220,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 
x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -234,8 +208,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -248,8 +220,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -262,8 +232,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -276,8 +244,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -290,8 +256,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -306,8 +270,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -320,8 +282,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm 
bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -334,8 +294,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -348,8 +306,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -362,8 +318,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) @@ -376,8 +330,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) @@ -390,8 +342,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) @@ -404,8 +354,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll index c595c939e8d1..cea848e72ce0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -325,8 +325,6 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: 
s_endpgm %load = load <8 x i32>, ptr addrspace(4) %ptr, align 1 store <8 x i32> %load, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index 55ff6410c235..2c71366772fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -31,8 +31,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr(ptr addrspace(1) inreg %ptr) { ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_store_b32 v0, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm store i32 0, ptr addrspace(1) %ptr ret void @@ -65,8 +63,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_store_b32 v0, v0, s[2:3] offset:16380 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 store i32 0, ptr addrspace(1) %gep @@ -105,8 +101,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 store i32 0, ptr addrspace(1) %gep @@ -145,8 +139,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) in ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297 store i32 0, ptr addrspace(1) %gep @@ -180,8 +172,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4096(ptr addrspace(1) inreg %p ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: global_store_b32 v0, v0, s[2:3] offset:16384 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4096 store i32 0, ptr addrspace(1) %gep @@ -213,8 +203,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4095(ptr addrspace(1) %ptr) { ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16380 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 store i32 0, ptr addrspace(1) %gep @@ -248,8 +236,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(ptr addrspace(1) %p ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 store i32 0, ptr addrspace(1) %gep @@ -283,8 +269,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297 store i32 0, ptr addrspace(1) %gep @@ -316,8 +300,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(ptr addrspace(1) %ptr) { ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16384 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4096 store i32 0, ptr addrspace(1) %gep @@ -362,8 +344,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX12-NEXT: s_add_co_u32 s0, s2, s0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, s1 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset store i32 0, ptr addrspace(1) %gep @@ -402,8 +382,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset store i32 0, ptr addrspace(1) %gep @@ -442,8 +420,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(ptr addrspace( ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:1024 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset %gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 256 @@ -483,8 +459,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(ptr addrspace( ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:1024 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 256 %gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 %soffset @@ -528,8 +502,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg % ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset store i32 0, ptr addrspace(1) %gep @@ -574,8 +546,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16380 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset %gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 4095 @@ -620,8 +590,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16380 -; GFX12-NEXT: s_nop 0 
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 4095 %gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 %voffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 489f46d1237a..b3b7457da64d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -39,8 +39,6 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v5, v7 ; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -93,8 +91,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -147,8 +143,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -195,8 +189,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -250,8 +242,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -300,8 +290,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -351,8 +339,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: v_mul_lo_u32 v1, v0, v2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() 
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -384,8 +370,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -448,8 +432,6 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: global_store_b64 v0, v[4:5], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -481,8 +463,6 @@ define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, p ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -583,8 +563,6 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 42f1bf84c042..8cc4b7759c3a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2519,8 +2519,6 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_vregs: @@ -2529,8 +2527,6 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = zext i32 %val to i64 @@ -2613,8 +2609,6 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_sregs: @@ -2629,8 +2623,6 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: 
s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = zext i32 %val to i64 @@ -2695,8 +2687,6 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4] ; GFX11-NEXT: v_mov_b32_e32 v3, v4 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_sext_with_vregs: @@ -2705,8 +2695,6 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = sext i32 %val to i64 @@ -2804,8 +2792,6 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_add_i32 s3, s4, s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_sext_with_sregs: @@ -2820,8 +2806,6 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = sext i32 %val to i64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 183f2edbf903..adbe92fdbc62 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -305,8 +305,6 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX11-NEXT: global_store_b32 v[2:3], v1, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -404,8 +402,6 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; GFX11-NEXT: v_mul_i32_i24_e32 v0, -7, v0 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll index e500aae7e0f3..9cf9839e69d5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A @@ 
-25,8 +23,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B @@ -42,8 +38,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C @@ -59,8 +53,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) @@ -76,8 +68,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C @@ -93,8 +83,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) @@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A @@ -123,8 +109,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B @@ -138,8 +122,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.C = fneg <8 x half> %C @@ -153,8 +135,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 
x half> @llvm.fabs.v8f16(<8 x half> %C) @@ -170,8 +150,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C @@ -187,8 +165,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) @@ -204,8 +180,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C @@ -221,8 +195,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) @@ -238,8 +210,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C @@ -255,8 +225,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) @@ -272,8 +240,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.C = fneg <8 x float> %C @@ -289,8 +255,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) @@ -306,8 +270,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off ; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.A = 
fneg <8 x half> %A @@ -323,8 +285,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off ; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.B = fneg <16 x half> %B @@ -338,8 +298,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A @@ -353,8 +311,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.B = fneg <16 x half> %B @@ -372,8 +328,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) @@ -388,8 +342,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) @@ -408,8 +360,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %el3 = extractelement <8 x float> %C, i32 3 @@ -430,8 +380,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.A = fneg <8 x half> %A @@ -445,8 +393,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B @@ -477,8 +423,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] ; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off -; 
GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %C = load <16 x half>, ptr %Caddr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll index 3037c1ec2829..4959e10d2a18 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll @@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) @@ -38,8 +36,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) @@ -54,8 +50,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) @@ -84,8 +78,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> ) @@ -98,8 +90,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) @@ -120,8 +110,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) @@ -142,8 +130,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] ; GFX12-NEXT: global_store_b128 
v[8:9], v[10:13], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) @@ -164,8 +150,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) @@ -180,8 +164,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) @@ -210,8 +192,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) @@ -226,8 +206,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) @@ -256,8 +234,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) @@ -272,8 +248,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) @@ -302,8 +276,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) @@ -318,8 +290,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) @@ -348,8 +318,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) @@ -364,8 +332,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) @@ -394,8 +360,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) @@ -410,8 +374,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) @@ -440,8 +402,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) @@ -456,8 +416,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) @@ -486,8 +444,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> % ; GFX12-NEXT: 
s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll index 086144873a04..22c61f992622 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll @@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -40,8 +36,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -58,8 +52,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off ; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) @@ -74,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off ; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) @@ -90,8 +80,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off ; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 
%B, <8 x i32> %C, i1 1) @@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -124,8 +110,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -140,8 +124,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -160,8 +142,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) @@ -176,8 +156,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) @@ -192,8 +170,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) @@ -210,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off ; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) @@ -226,8 +200,6 @@ define amdgpu_ps 
void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off ; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) @@ -242,8 +214,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off ; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) @@ -260,8 +230,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) @@ -276,8 +244,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) @@ -292,8 +258,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll index a6e1f5ef12b4..7eafe53ea84c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll @@ -19,8 +19,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off ; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -51,8 +49,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off ; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -77,8 +73,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off ; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -103,8 +97,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off ; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -135,8 +127,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -167,8 +157,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off ; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -199,8 +187,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -231,8 +217,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -263,8 +247,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -295,8 +277,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 diff 
--git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll index 3aa81da317d6..80497115e41b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll @@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) @@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, < ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off ; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) @@ -38,8 +34,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) @@ -52,8 +46,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) @@ -68,8 +60,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -84,8 +74,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off ; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) @@ -100,8 +88,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) @@ -116,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) @@ -132,8 +116,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) @@ -148,8 +130,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) @@ -164,8 +144,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off ; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -181,8 +159,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off ; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) @@ -197,8 +173,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off ; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) @@ -211,8 +185,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> % ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) @@ -225,8 
+197,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> % ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) @@ -241,8 +211,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) @@ -257,8 +225,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off ; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) @@ -273,8 +239,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) @@ -289,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) @@ -305,8 +267,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) @@ -321,8 +281,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off ; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) @@ -337,8 +295,6 @@ define amdgpu_ps void 
@test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 08a5d6660bab..be1761227f80 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -21,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -36,8 +32,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -51,8 +45,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -66,8 +58,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -81,8 +71,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -96,8 +84,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -111,8 +97,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -126,8 +110,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x half> %C
@@ -141,8 +123,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
@@ -156,8 +136,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -171,8 +149,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -186,8 +162,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -201,8 +175,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -216,8 +188,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -231,8 +201,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -246,8 +214,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -261,8 +227,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -276,8 +240,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -291,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -306,8 +266,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -321,8 +279,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -338,8 +294,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -354,8 +308,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
@@ -372,8 +324,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <4 x float> %C, i32 3
@@ -392,8 +342,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -407,8 +355,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -432,8 +378,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %C = load <8 x half>, ptr %Caddr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
index 738671cf7c64..173dd011f4d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> )
@@ -30,8 +28,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> )
@@ -44,8 +40,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> )
@@ -68,8 +62,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> )
@@ -82,8 +74,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0)
@@ -102,8 +92,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0)
@@ -122,8 +110,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0)
@@ -142,8 +128,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0)
@@ -156,8 +140,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -180,8 +162,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -194,8 +174,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -218,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -232,8 +208,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> )
@@ -256,8 +230,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> )
@@ -270,8 +242,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> )
@@ -294,8 +264,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> )
@@ -308,8 +276,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> )
@@ -332,8 +298,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> )
@@ -346,8 +310,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> )
@@ -370,8 +332,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> )
@@ -384,8 +344,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -408,8 +366,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
index 25f4145f9724..83bbf56daec2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -50,8 +44,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -64,8 +56,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -78,8 +68,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -94,8 +82,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -108,8 +94,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -122,8 +106,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -141,8 +123,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -155,8 +135,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -169,8 +147,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
@@ -185,8 +161,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -199,8 +173,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -213,8 +185,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
@@ -229,8 +199,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -243,8 +211,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -257,8 +223,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index 87fe8334d059..1e9ef07ba754 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -27,8 +27,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8
 ; GFX12-NEXT:    global_store_b128 v[14:15], v[24:27], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[28:31], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -73,8 +71,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8
 ; GFX12-NEXT:    global_store_b128 v[14:15], v[24:27], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[28:31], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -113,8 +109,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8
 ; GFX12-NEXT:    global_store_b64 v[12:13], v[18:19], off
 ; GFX12-NEXT:    global_store_b64 v[14:15], v[20:21], off
 ; GFX12-NEXT:    global_store_b64 v[16:17], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -153,8 +147,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8
 ; GFX12-NEXT:    global_store_b64 v[12:13], v[18:19], off
 ; GFX12-NEXT:    global_store_b64 v[14:15], v[20:21], off
 ; GFX12-NEXT:    global_store_b64 v[16:17], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -199,8 +191,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -233,8 +223,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -261,8 +249,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[9:10], v[13:16], off
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -301,8 +287,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -347,8 +331,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -393,8 +375,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -439,8 +419,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
index a03180f03952..f01679fb45d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
@@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
@@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
@@ -48,8 +42,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
@@ -62,8 +54,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -76,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -90,8 +78,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
@@ -104,8 +90,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
@@ -118,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
@@ -132,8 +114,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
@@ -146,8 +126,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -160,8 +138,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
@@ -174,8 +150,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
@@ -188,8 +162,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
@@ -202,8 +174,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
@@ -216,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -230,8 +198,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -244,8 +210,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -258,8 +222,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -272,8 +234,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -286,8 +246,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -300,8 +258,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 94d704fa3f92..3c9d43a88a0f 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -67,8 +67,6 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_i32:
@@ -81,8 +79,6 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
   %a = load i32, ptr addrspace(1) %in
@@ -163,8 +159,6 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_v2i32:
@@ -179,8 +173,6 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
   %a = load <2 x i32>, ptr addrspace(1) %in
@@ -280,8 +272,6 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_v4i32:
@@ -299,8 +289,6 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
   %a = load <4 x i32>, ptr addrspace(1) %in
@@ -446,8 +434,6 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_v8i32:
@@ -472,8 +458,6 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %0 = add <8 x i32> %a, %b
@@ -719,8 +703,6 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
 ; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_v16i32:
@@ -760,8 +742,6 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %0 = add <16 x i32> %a, %b
@@ -853,8 +833,6 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_add_i32:
@@ -871,8 +849,6 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -955,8 +931,6 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_add_imm_i32:
@@ -970,8 +944,6 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -1051,8 +1023,6 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: add64:
@@ -1065,8 +1035,6 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %add = add i64 %a, %b
@@ -1157,8 +1125,6 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: add64_sgpr_vgpr:
@@ -1173,8 +1139,6 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %0 = load i64, ptr addrspace(1) %in
@@ -1303,8 +1267,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB9_4:
 ; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
@@ -1326,8 +1288,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ; GFX12-NEXT:  .LBB9_4:
 ; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 4cc384e9d271..b413e779dbaf 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -78,8 +78,6 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -154,8 +152,6 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, s2, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load <2 x i16>, ptr addrspace(4) %in0
   %b = load <2 x i16>, ptr addrspace(4) %in1
@@ -215,8 +211,6 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, s2, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load <2 x i16>, ptr addrspace(4) %in0
   %add = add <2 x i16> %a, %a
@@ -269,8 +263,6 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <2 x i16> %a, %b
   store <2 x i16> %add, ptr addrspace(1) %out
@@ -334,8 +326,6 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, 0x1c8007b, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -403,8 +393,6 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, 0xfc21fcb3, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -470,8 +458,6 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, -1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -536,8 +522,6 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, 32
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -603,8 +587,6 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, 1.0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -694,8 +676,6 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
@@ -792,8 +772,6 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_alignbit_b32 v2, 0, v0, 16
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
 ; GFX11-NEXT:    global_store_b128 v1, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
@@ -888,8 +866,6 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
@@ -994,8 +970,6 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 5cf9c9faa693..ad0babd74f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -185,8 +185,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_constant:
@@ -216,8 +214,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_constant:
@@ -250,8 +246,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_constant:
@@ -284,8 +278,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -471,8 +463,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_uniform:
@@ -503,8 +493,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_uniform:
@@ -538,8 +526,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_uniform:
@@ -573,8 +559,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -831,8 +815,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
@@ -875,8 +857,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_vdata:
@@ -923,8 +903,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_vdata:
@@ -971,8 +949,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1245,8 +1221,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: struct_add_i32_varying_vdata:
@@ -1291,8 +1265,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: struct_add_i32_varying_vdata:
@@ -1342,8 +1314,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: struct_add_i32_varying_vdata:
@@ -1392,8 +1362,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1466,8 +1434,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_offset:
@@ -1481,8 +1447,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_offset:
@@ -1497,8 +1461,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_offset:
@@ -1512,8 +1474,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1697,8 +1657,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_constant:
@@ -1729,8 +1687,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_constant:
@@ -1764,8 +1720,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_constant:
@@ -1799,8 +1753,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -1987,8 +1939,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_uniform:
@@ -2020,8 +1970,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_uniform:
@@ -2056,8 +2004,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_uniform:
@@ -2091,8 +2037,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -2349,8 +2293,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
@@ -2394,8 +2336,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_varying_vdata:
@@ -2442,8 +2382,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 
v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: @@ -2491,8 +2429,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -2565,8 +2501,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_offset: @@ -2580,8 +2514,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_offset: @@ -2596,8 +2528,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_offset: @@ -2611,8 +2541,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index d24eed841a9a..495bfec5454e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -219,8 +219,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_constant: @@ -255,8 +253,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i32_constant: @@ -294,8 +290,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: add_i32_constant: @@ -333,8 +327,6 @@ define amdgpu_kernel void 
@add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel @@ -551,8 +543,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1] ; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_uniform: @@ -589,8 +579,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] ; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i32_uniform: @@ -628,8 +616,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1] ; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: add_i32_uniform: @@ -666,8 +652,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3] ; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel @@ -953,8 +937,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: add_i32_varying: @@ -1004,8 +986,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX1264_ITERATIVE-LABEL: add_i32_varying: @@ -1057,8 +1037,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1264_ITERATIVE-NEXT: s_nop 0 -; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264_ITERATIVE-NEXT: s_endpgm ; ; GFX1232_ITERATIVE-LABEL: add_i32_varying: @@ -1111,8 +1089,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; 
GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1232_ITERATIVE-NEXT: s_nop 0 -; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: add_i32_varying: @@ -1412,8 +1388,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i32_varying: @@ -1471,8 +1445,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm ; ; GFX1264_DPP-LABEL: add_i32_varying: @@ -1545,8 +1517,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1264_DPP-NEXT: s_nop 0 -; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264_DPP-NEXT: s_endpgm ; ; GFX1232_DPP-LABEL: add_i32_varying: @@ -1605,8 +1575,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1232_DPP-NEXT: s_nop 0 -; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -1834,8 +1802,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_constant: @@ -1871,8 +1837,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_constant: @@ -1912,8 +1876,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: add_i64_constant: @@ -1952,8 +1914,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel @@ -2214,8 +2174,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( 
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v3 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -2260,8 +2218,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform: @@ -2304,8 +2260,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3] ; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2] ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: add_i64_uniform: @@ -2346,8 +2300,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3] ; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2] ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel @@ -2679,8 +2631,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying: @@ -2736,8 +2686,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX1264_ITERATIVE-LABEL: add_i64_varying: @@ -2794,8 +2742,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX1264_ITERATIVE-NEXT: s_nop 0 -; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264_ITERATIVE-NEXT: s_endpgm ; ; GFX1232_ITERATIVE-LABEL: add_i64_varying: @@ -2849,8 +2795,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX1232_ITERATIVE-NEXT: s_nop 0 -; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: add_i64_varying: @@ -3352,8 +3296,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc ; GFX1164_DPP-NEXT: s_mov_b32 s3, 
0x31016000 ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i64_varying: @@ -3442,8 +3384,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm ; ; GFX1264_DPP-LABEL: add_i64_varying: @@ -3556,8 +3496,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null -; GFX1264_DPP-NEXT: s_nop 0 -; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264_DPP-NEXT: s_endpgm ; ; GFX1232_DPP-LABEL: add_i64_varying: @@ -3647,8 +3585,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null -; GFX1232_DPP-NEXT: s_nop 0 -; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -3861,8 +3797,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_constant: @@ -3898,8 +3832,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: sub_i32_constant: @@ -3938,8 +3870,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: sub_i32_constant: @@ -3978,8 +3908,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel @@ -4199,8 +4127,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_uniform: @@ -4238,8 +4164,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) 
%out, ptr addrspace( ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: sub_i32_uniform: @@ -4278,8 +4202,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: sub_i32_uniform: @@ -4317,8 +4239,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel @@ -4604,8 +4524,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: sub_i32_varying: @@ -4655,8 +4573,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX1264_ITERATIVE-LABEL: sub_i32_varying: @@ -4708,8 +4624,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1264_ITERATIVE-NEXT: s_nop 0 -; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264_ITERATIVE-NEXT: s_endpgm ; ; GFX1232_ITERATIVE-LABEL: sub_i32_varying: @@ -4762,8 +4676,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1232_ITERATIVE-NEXT: s_nop 0 -; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: sub_i32_varying: @@ -5063,8 +4975,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i32_varying: @@ -5122,8 +5032,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 
0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm ; ; GFX1264_DPP-LABEL: sub_i32_varying: @@ -5196,8 +5104,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1264_DPP-NEXT: s_nop 0 -; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264_DPP-NEXT: s_endpgm ; ; GFX1232_DPP-LABEL: sub_i32_varying: @@ -5256,8 +5162,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1232_DPP-NEXT: s_nop 0 -; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -5497,8 +5401,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i64_constant: @@ -5537,8 +5439,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: sub_i64_constant: @@ -5581,8 +5481,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: sub_i64_constant: @@ -5624,8 +5522,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel @@ -5897,8 +5793,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_mov_b32_e32 v1, v5 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i64_uniform: @@ -5945,8 +5839,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mov_b32_e32 v1, v5 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: sub_i64_uniform: @@ -5993,8 +5885,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mov_b32_e32 v1, v4 ; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc ; GFX1264-NEXT: buffer_store_b64 
v[0:1], off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: sub_i64_uniform: @@ -6039,8 +5929,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mov_b32_e32 v1, v4 ; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel @@ -6372,8 +6260,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: sub_i64_varying: @@ -6429,8 +6315,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX1264_ITERATIVE-LABEL: sub_i64_varying: @@ -6487,8 +6371,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX1264_ITERATIVE-NEXT: s_nop 0 -; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264_ITERATIVE-NEXT: s_endpgm ; ; GFX1232_ITERATIVE-LABEL: sub_i64_varying: @@ -6542,8 +6424,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX1232_ITERATIVE-NEXT: s_nop 0 -; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: sub_i64_varying: @@ -7045,8 +6925,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i64_varying: @@ -7135,8 +7013,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm ; ; GFX1264_DPP-LABEL: sub_i64_varying: @@ -7249,8 +7125,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc ; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null -; GFX1264_DPP-NEXT: 
s_nop 0 -; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264_DPP-NEXT: s_endpgm ; ; GFX1232_DPP-LABEL: sub_i64_varying: @@ -7340,8 +7214,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo ; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null -; GFX1232_DPP-NEXT: s_nop 0 -; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index ce90fbed8131..5924649ebe73 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -193,8 +193,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_constant: @@ -224,8 +222,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var32, i32 5 acq_rel @@ -420,8 +416,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_uniform: @@ -453,8 +447,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] ; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %additive acq_rel @@ -713,8 +705,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: add_i32_varying: @@ -758,8 +748,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: add_i32_varying: @@ -1017,8 +1005,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 
-; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i32_varying: @@ -1067,8 +1053,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -1674,8 +1658,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_constant: @@ -1707,8 +1689,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 5 acq_rel @@ -1944,8 +1924,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v3 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -1984,8 +1962,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel @@ -2290,8 +2266,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying: @@ -2342,8 +2316,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: add_i64_varying: @@ -2814,8 +2786,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v11, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i64_varying: @@ -2897,8 +2867,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -3698,8 +3666,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_constant: @@ -3730,8 +3696,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 5 acq_rel @@ -3927,8 +3891,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_uniform: @@ -3961,8 +3923,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %subitive acq_rel @@ -4221,8 +4181,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: sub_i32_varying: @@ -4266,8 +4224,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: sub_i32_varying: @@ -4525,8 +4481,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i32_varying: @@ -4575,8 +4529,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -5193,8 +5145,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i64_constant: @@ -5229,8 +5179,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel @@ -5476,8 +5424,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1164-NEXT: v_mov_b32_e32 v1, v5 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i64_uniform: @@ -5518,8 +5464,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1132-NEXT: v_mov_b32_e32 v1, v5 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel @@ -5824,8 +5768,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: sub_i64_varying: @@ -5876,8 +5818,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: sub_i64_varying: @@ -6348,8 +6288,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v11, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i64_varying: @@ -6431,8 +6369,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -6693,8 +6629,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: and_i32_varying: 
@@ -6738,8 +6672,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: and_i32_varying: @@ -6997,8 +6929,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: and_i32_varying: @@ -7047,8 +6977,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -7343,8 +7271,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: and_i64_varying: @@ -7392,8 +7318,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: and_i64_varying: @@ -7739,8 +7663,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: and_i64_varying: @@ -7808,8 +7730,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -8070,8 +7990,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: or_i32_varying: @@ -8115,8 +8033,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; 
GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: or_i32_varying: @@ -8374,8 +8290,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i32_varying: @@ -8424,8 +8338,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -8720,8 +8632,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: or_i64_varying: @@ -8769,8 +8679,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: or_i64_varying: @@ -9118,8 +9026,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i64_varying: @@ -9187,8 +9093,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -9449,8 +9353,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: xor_i32_varying: @@ -9494,8 +9396,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: xor_i32_varying: @@ -9753,8 +9653,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: xor_i32_varying: @@ -9803,8 +9701,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -10099,8 +9995,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: xor_i64_varying: @@ -10148,8 +10042,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: xor_i64_varying: @@ -10497,8 +10389,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: xor_i64_varying: @@ -10566,8 +10456,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -10828,8 +10716,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: max_i32_varying: @@ -10873,8 +10759,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: max_i32_varying: @@ -11132,8 +11016,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: max_i32_varying: @@ -11182,8 +11064,6 @@ define 
amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -11390,8 +11270,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: max_i64_constant: @@ -11423,8 +11301,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel @@ -11763,8 +11639,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: max_i64_varying: @@ -11820,8 +11694,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: max_i64_varying: @@ -12346,8 +12218,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: max_i64_varying: @@ -12437,8 +12307,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -12699,8 +12567,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: min_i32_varying: @@ -12744,8 +12610,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: min_i32_varying: @@ -13003,8 +12867,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: min_i32_varying: @@ -13053,8 +12915,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -13261,8 +13121,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: min_i64_constant: @@ -13294,8 +13152,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel @@ -13634,8 +13490,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: min_i64_varying: @@ -13691,8 +13545,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: min_i64_varying: @@ -14210,8 +14062,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: min_i64_varying: @@ -14300,8 +14150,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -14562,8 +14410,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 
v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: umax_i32_varying: @@ -14607,8 +14453,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: umax_i32_varying: @@ -14866,8 +14710,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umax_i32_varying: @@ -14916,8 +14758,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -15121,8 +14961,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: umax_i64_constant: @@ -15154,8 +14992,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw umax ptr addrspace(3) @local_var64, i64 5 acq_rel @@ -15488,8 +15324,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: umax_i64_varying: @@ -15544,8 +15378,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: umax_i64_varying: @@ -16056,8 +15888,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umax_i64_varying: @@ -16146,8 +15976,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: 
v_cndmask_b32_e64 v9, v9, s0, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -16408,8 +16236,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: umin_i32_varying: @@ -16453,8 +16279,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: umin_i32_varying: @@ -16712,8 +16536,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s0, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umin_i32_varying: @@ -16762,8 +16584,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s0, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -16967,8 +16787,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: umin_i64_constant: @@ -17000,8 +16818,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw umin ptr addrspace(3) @local_var64, i64 5 acq_rel @@ -17334,8 +17150,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164_ITERATIVE-NEXT: s_nop 0 -; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: umin_i64_varying: @@ -17390,8 +17204,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132_ITERATIVE-NEXT: s_nop 0 -; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: 
umin_i64_varying: @@ -17900,8 +17712,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 -; GFX1164_DPP-NEXT: s_nop 0 -; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umin_i64_varying: @@ -17990,8 +17800,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 -; GFX1132_DPP-NEXT: s_nop 0 -; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 429e6c489bf6..4ae08a0375c8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -193,8 +193,6 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: .LBB0_6: ; %UnifiedReturnBlock -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_constant: @@ -234,8 +232,6 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: .LBB0_6: ; %UnifiedReturnBlock -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true) @@ -552,8 +548,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v4, off, s[0:3], 0 ; GFX1164-NEXT: .LBB1_6: ; %UnifiedReturnBlock -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_varying: @@ -612,8 +606,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v4, off, s[0:3], 0 ; GFX1132-NEXT: .LBB1_6: ; %UnifiedReturnBlock -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 9d4dfd891125..a1f7d2ca3d33 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -184,8 +184,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_constant: @@ -215,8 +213,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_constant: @@ -249,8 +245,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_constant: @@ -283,8 +277,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) @@ -470,8 +462,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_uniform: @@ -502,8 +492,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: @@ -537,8 +525,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_uniform: @@ -572,8 +558,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) @@ -830,8 +814,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: @@ -874,8 +856,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: @@ -922,8 +902,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) 
%out, ptr addr ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: @@ -970,8 +948,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -1044,8 +1020,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_offset: @@ -1059,8 +1033,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_offset: @@ -1075,8 +1047,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_offset: @@ -1090,8 +1060,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -1275,8 +1243,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_constant: @@ -1307,8 +1273,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_constant: @@ -1342,8 +1306,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_constant: @@ -1377,8 +1339,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: 
global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) @@ -1565,8 +1525,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_uniform: @@ -1598,8 +1556,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: @@ -1634,8 +1590,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_uniform: @@ -1669,8 +1623,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0) @@ -1927,8 +1879,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: @@ -1972,8 +1922,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: @@ -2020,8 +1968,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: @@ -2069,8 +2015,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -2143,8 +2087,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; 
GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_offset: @@ -2158,8 +2100,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_offset: @@ -2174,8 +2114,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_offset: @@ -2189,8 +2127,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 3fb44e090c61..d0987068841b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -190,8 +190,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_constant: @@ -222,8 +220,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_constant: @@ -257,8 +253,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_constant: @@ -291,8 +285,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) @@ -484,8 +476,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, 
v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_uniform: @@ -517,8 +507,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: @@ -553,8 +541,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_uniform: @@ -588,8 +574,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) @@ -852,8 +836,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: @@ -896,8 +878,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: @@ -945,8 +925,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: @@ -993,8 +971,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -1067,8 +1043,6 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vindex: @@ -1082,8 +1056,6 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: 
s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vindex: @@ -1098,8 +1070,6 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vindex: @@ -1113,8 +1083,6 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -1199,8 +1167,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_offset: @@ -1216,8 +1182,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_offset: @@ -1232,8 +1196,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_offset: @@ -1247,8 +1209,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -1438,8 +1398,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_constant: @@ -1471,8 +1429,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_constant: @@ -1507,8 +1463,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; 
GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_constant: @@ -1542,8 +1496,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) @@ -1736,8 +1688,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_uniform: @@ -1770,8 +1720,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: @@ -1807,8 +1755,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_uniform: @@ -1842,8 +1788,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0) @@ -2106,8 +2050,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: @@ -2151,8 +2093,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: @@ -2200,8 +2140,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: @@ -2249,8 +2187,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) 
%out, ptr addr ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -2323,8 +2259,6 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vindex: @@ -2338,8 +2272,6 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vindex: @@ -2354,8 +2286,6 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vindex: @@ -2369,8 +2299,6 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -2455,8 +2383,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_offset: @@ -2472,8 +2398,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_offset: @@ -2488,8 +2412,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_offset: @@ -2503,8 +2425,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll 
index 417d38990505..c630effa4b04 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -118,8 +118,6 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: @@ -128,8 +126,6 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 @@ -148,8 +144,6 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: @@ -162,8 +156,6 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 6f52da2631b8..b281c1bf3f9c 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -69,8 +69,6 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] -; GFX11-FLAT-NEXT: s_nop 0 -; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: s_brev_i16: @@ -87,8 +85,6 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, ptr addrspace(1) %out @@ -162,8 +158,6 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] -; GFX11-FLAT-NEXT: s_nop 0 -; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_brev_i16: @@ -175,8 +169,6 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr 
addrspa ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 @@ -233,8 +225,6 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 ; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-FLAT-NEXT: s_nop 0 -; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: s_brev_i32: @@ -248,8 +238,6 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, ptr addrspace(1) %out @@ -322,8 +310,6 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-FLAT-NEXT: s_nop 0 -; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_brev_i32: @@ -337,8 +323,6 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -405,8 +389,6 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; GFX11-FLAT-NEXT: s_mov_b32 s4, s0 ; GFX11-FLAT-NEXT: s_mov_b32 s5, s1 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-FLAT-NEXT: s_nop 0 -; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: s_brev_v2i32: @@ -419,8 +401,6 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, ptr addrspace(1) %out @@ -497,8 +477,6 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-FLAT-NEXT: s_nop 0 -; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_brev_v2i32: @@ -514,8 +492,6 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], 
s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -575,8 +551,6 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #
 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-FLAT-NEXT: s_nop 0
-; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_i64:
@@ -588,8 +562,6 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #
 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
 store i64 %brev, ptr addrspace(1) %out
@@ -666,8 +638,6 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0
 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1
 ; GFX11-FLAT-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
-; GFX11-FLAT-NEXT: s_nop 0
-; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i64:
@@ -683,8 +653,6 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v2, v0
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep = getelementptr i64, ptr addrspace(1) %valptr, i32 %tid
@@ -756,8 +724,6 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-FLAT-NEXT: s_nop 0
-; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_v2i64:
@@ -772,8 +738,6 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[8:9]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
 store <2 x i64> %brev, ptr addrspace(1) %out
@@ -858,8 +822,6 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0
 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1
 ; GFX11-FLAT-NEXT: buffer_store_b128 v[1:4], off, s[0:3], 0
-; GFX11-FLAT-NEXT: s_nop 0
-; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_v2i64:
@@ -877,8 +839,6 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v7, v2
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT: global_store_b128 v0, v[4:7], s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep = getelementptr <2 x i64> , ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 857b13fab8a7..afff56c7f0eb 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -82,13 +82,9 @@ define amdgpu_kernel void @br_cc_f16(
 ; GFX11-NEXT: s_cbranch_vccnz .LBB0_2
 ; GFX11-NEXT: ; %bb.1: ; %one
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ; GFX11-NEXT: .LBB0_2: ; %two
 ; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -173,8 +169,6 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
 ; GFX11-NEXT: s_mov_b32 s2, s6
 ; GFX11-NEXT: s_mov_b32 s3, s7
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %b) {
@@ -259,8 +253,6 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
 ; GFX11-NEXT: s_mov_b32 s2, s6
 ; GFX11-NEXT: s_mov_b32 s3, s7
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index bb642afecc9c..77f1bc2a172a 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -73,8 +73,6 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: uniform_conditional_max_short_forward_branch:
@@ -104,8 +102,6 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 bb:
 %cmp = icmp eq i32 %cnd, 0
@@ -183,8 +179,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: uniform_conditional_min_long_forward_branch:
@@ -214,8 +208,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 bb0:
 %cmp = icmp eq i32 %cnd, 0
@@ -298,8 +290,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
@@ -333,8 +323,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 bb0:
 %cmp = fcmp oeq float %cnd, 0.0
@@ -426,8 +414,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: min_long_forward_vbranch:
@@ -465,8 +451,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 bb:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -646,16 +630,8 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: s_cmp_eq_u32 s0, 0
 ; GFX11-NEXT: s_mov_b64 s[0:1], -1
-; GFX11-NEXT: s_cbranch_scc0 .LBB5_1
-; GFX11-NEXT: ; %bb.7: ; %bb0
-; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: .Lpost_getpc6:
-; GFX11-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11-NEXT: s_add_u32 s0, s0, (.LBB5_4-.Lpost_getpc6)&4294967295
-; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB5_4-.Lpost_getpc6)>>32
-; GFX11-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11-NEXT: s_setpc_b64 s[0:1]
-; GFX11-NEXT: .LBB5_1: ; %Flow
+; GFX11-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX11-NEXT: ; %bb.1: ; %Flow
 ; GFX11-NEXT: s_and_not1_b64 vcc, exec, s[0:1]
 ; GFX11-NEXT: s_cbranch_vccnz .LBB5_3
 ; GFX11-NEXT: .LBB5_2: ; %bb2
@@ -669,8 +645,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ; GFX11-NEXT: .LBB5_4: ; %bb3
 ; GFX11-NEXT: ;;#ASMSTART
@@ -680,12 +654,12 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX11-NEXT: v_nop_e64
 ; GFX11-NEXT: ;;#ASMEND
 ; GFX11-NEXT: s_cbranch_execnz .LBB5_5
-; GFX11-NEXT: ; %bb.9: ; %bb3
+; GFX11-NEXT: ; %bb.7: ; %bb3
 ; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: .Lpost_getpc7:
+; GFX11-NEXT: .Lpost_getpc6:
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11-NEXT: s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc7)&4294967295
-; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc7)>>32
+; GFX11-NEXT: s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295
+; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT: s_setpc_b64 s[0:1]
 ; GFX11-NEXT: .LBB5_5: ; %bb3
@@ -725,8 +699,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ; GFX12-NEXT: .LBB5_4: ; %bb3
 ; GFX12-NEXT: ;;#ASMSTART
@@ -814,10 +786,10 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad
 ; GFX11-NEXT: ; %bb.3: ; %loop
 ; GFX11-NEXT: ; in Loop: Header=BB6_1 Depth=1
 ; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: .Lpost_getpc8:
+; GFX11-NEXT: .Lpost_getpc7:
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11-NEXT: s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc8)&4294967295
-; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc8)>>32
+; GFX11-NEXT: s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc7)&4294967295
+; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc7)>>32
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT: s_setpc_b64 s[0:1]
 ; GFX11-NEXT: .LBB6_2: ; %DummyReturnBlock
@@ -921,10 +893,10 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 ; GFX11-NEXT: s_cbranch_vccz .LBB7_3
 ; GFX11-NEXT: ; %bb.5: ; %Flow
 ; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: .Lpost_getpc9:
+; GFX11-NEXT: .Lpost_getpc8:
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11-NEXT: s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc9)&4294967295
-; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc9)>>32
+; GFX11-NEXT: s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc8)&4294967295
+; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc8)>>32
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT: s_setpc_b64 s[0:1]
 ; GFX11-NEXT: .LBB7_3: ; %bb2
@@ -1067,8 +1039,6 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
 ; GFX11-NEXT: .LBB8_3: ; %endif
 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT: s_sleep 5
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: uniform_inside_divergent:
@@ -1091,8 +1061,6 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
 ; GFX12-NEXT: .LBB8_3: ; %endif
 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT: s_sleep 5
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1186,10 +1154,10 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GFX11-NEXT: s_cbranch_execnz .LBB9_3
 ; GFX11-NEXT: ; %bb.6: ; %Flow1
 ; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: .Lpost_getpc10:
+; GFX11-NEXT: .Lpost_getpc9:
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11-NEXT: s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295
-; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32
+; GFX11-NEXT: s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc9)&4294967295
+; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc9)>>32
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT: s_setpc_b64 s[0:1]
 ; GFX11-NEXT: .LBB9_3: ; %loop.preheader
@@ -1211,10 +1179,10 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GFX11-NEXT: ; %bb.8: ; %loop
 ; GFX11-NEXT: ; in Loop: Header=BB9_4 Depth=1
 ; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: .Lpost_getpc11:
+; GFX11-NEXT: .Lpost_getpc10:
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11-NEXT: s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295
-; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32
+; GFX11-NEXT: s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc10)&4294967295
+; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc10)>>32
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT: s_setpc_b64 s[0:1]
 ; GFX11-NEXT: .LBB9_5: ; %UnifiedReturnBlock
@@ -1374,10 +1342,10 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT: s_cbranch_scc1 .LBB10_1
 ; GFX11-NEXT: ; %bb.8: ; %bb
 ; GFX11-NEXT: s_getpc_b64 s[8:9]
-; GFX11-NEXT: .Lpost_getpc12:
+; GFX11-NEXT: .Lpost_getpc11:
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
-; GFX11-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295
-; GFX11-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32
+; GFX11-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc11)&4294967295
+; GFX11-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc11)>>32
 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT: s_setpc_b64 s[8:9]
 ; GFX11-NEXT: .LBB10_1: ; %bb13
@@ -1426,8 +1394,6 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT: s_add_u32 s0, s2, s0
 ; GFX11-NEXT: s_addc_u32 s1, s3, s1
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: long_branch_hang:
@@ -1537,8 +1503,6 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 bb:
 %tmp = icmp slt i32 %arg2, 9
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 321a7ceb826f..3c48a0f0dcab 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -57,8 +57,6 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT: v_perm_b32 v0, 0, s2, 0x10203
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %val = load i32, ptr addrspace(1) %in, align 4
 %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
@@ -112,8 +110,6 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203
 ; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %val = load <2 x i32>, ptr addrspace(1) %in, align 8
 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
@@ -177,8 +173,6 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203
 ; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203
 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %val = load <4 x i32>, ptr addrspace(1) %in, align 16
 %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
@@ -266,8 +260,6 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %val = load <8 x i32>, ptr addrspace(1) %in, align 32
 %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
@@ -321,8 +313,6 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
 ; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %val = load i64, ptr addrspace(1) %in, align 8
 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
@@ -386,8 +376,6 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
 ; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %val = load <2 x i64>, ptr addrspace(1) %in, align 16
 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
@@ -475,8 +463,6 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %val = load <4 x i64>, ptr addrspace(1) %in, align 32
 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 13c4ff8b2ff3..dc9ce68a4a68 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -46,8 +46,6 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
 ; GFX11-NEXT: v_mov_b32_e32 v1, 6
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX940-LABEL: build_vector2:
@@ -113,8 +111,6 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
 ; GFX11-NEXT: v_mov_b32_e32 v3, 8
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX940-LABEL: build_vector4:
@@ -170,8 +166,6 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX940-LABEL: build_vector_v2i16:
@@ -236,8 +230,6 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX940-LABEL: build_vector_v2i16_trunc:
@@ -310,8 +302,6 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 29770738f83d..b334047d3255 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -32,8 +32,6 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 store i32 0, ptr addrspace(1) %out
@@ -487,8 +485,6 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = add <2 x i16> %arg0,
 store <2 x i16> %add, ptr addrspace(1) undef
@@ -524,8 +520,6 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0]
 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = add <2 x i16> %arg0,
 store <2 x i16> %add, ptr addrspace(1) undef
@@ -648,8 +642,6 @@ define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: v_mov_b32_e32 v2, s2
 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = add <3 x i32> %arg0,
 store <3 x i32> %add, ptr addrspace(1) undef
@@ -682,8 +674,6 @@ define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
 ; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0
 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0
 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = fadd <3 x float> %arg0,
 store <3 x float> %add, ptr addrspace(1) undef
@@ -739,8 +729,6 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = add <5 x i32> %arg0,
 store <5 x i32> %add, ptr addrspace(1) undef
@@ -782,8 +770,6 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = fadd <5 x float> %arg0,
 store <5 x float> %add, ptr addrspace(1) undef
@@ -816,8 +802,6 @@ define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1
 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = add <3 x i32> %arg0,
 store <3 x i32> %add, ptr addrspace(1) undef
@@ -849,8 +833,6 @@ define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
 ; GFX11-NEXT: v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1
 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = fadd <3 x float> %arg0,
 store <3 x float> %add, ptr addrspace(1) undef
@@ -892,8 +874,6 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = add <5 x i32> %arg0,
 store <5 x i32> %add, ptr addrspace(1) undef
@@ -933,8 +913,6 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = fadd <5 x float> %arg0,
 store <5 x float> %add, ptr addrspace(1) undef
@@ -960,8 +938,6 @@ define amdgpu_ps void @ps_mesa_i16(i16 %arg0) {
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = add i16 %arg0, %arg0
 store i16 %add, ptr addrspace(1) undef
@@ -993,8 +969,6 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 ; GFX11-NEXT: s_add_i32 s0, s0, s0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %add = add i16 %arg0, %arg0
 store i16 %add, ptr addrspace(1) undef
@@ -1038,8 +1012,6 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) {
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add i8 %arg0, %arg0
@@ -1096,8 +1068,6 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
 ; GFX11-NEXT: s_or_b32 s0, s0, s1
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add <2 x i8> %arg0, %arg0
@@ -1184,8 +1154,6 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ; GFX11-NEXT: s_or_b32 s0, s0, s1
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add <4 x i8> %arg0, %arg0
@@ -1259,8 +1227,6 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b16 v[2:3], v5, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add <3 x i8> %arg0, %arg0
@@ -1362,8 +1328,6 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b32 v[2:3], v5, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add <5 x i8> %arg0, %arg0
@@ -1500,8 +1464,6 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add <8 x i8> %arg0, %arg0
@@ -1738,8 +1700,6 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add <16 x i8> %arg0, %arg0
@@ -2187,8 +2147,6 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add <32 x i8> %arg0, %arg0
@@ -2215,8 +2173,6 @@ define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) {
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store i1 %arg0, ptr addrspace(1) undef
 ret void
@@ -2305,8 +2261,6 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store <8 x i1> %arg0, ptr addrspace(1) undef
 ret void
@@ -2467,8 +2421,6 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store <16 x i1> %arg0, ptr addrspace(1) undef
 ret void
@@ -2768,8 +2720,6 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store <32 x i1> %arg0, ptr addrspace(1) undef
 ret void
@@ -2798,8 +2748,6 @@ define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store i1 %arg0, ptr addrspace(1) undef
 ret void
@@ -2888,8 +2836,6 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
 ; GFX11-NEXT: s_or_b32 s0, s0, s1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store <8 x i1> %arg0, ptr addrspace(1) undef
 ret void
@@ -3050,8 +2996,6 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
 ; GFX11-NEXT: s_or_b32 s0, s0, s1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store <16 x i1> %arg0, ptr addrspace(1) undef
 ret void
@@ -3356,8 +3300,6 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
 ; GFX11-NEXT: s_or_b32 s0, s0, s1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store <32 x i1> %arg0, ptr addrspace(1) undef
 ret void
@@ -3382,8 +3324,6 @@ define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) {
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store i1 %arg0, ptr addrspace(1) undef
 ret void
@@ -3405,8 +3345,6 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) {
 ; GFX11-LABEL: amdgpu_cs_i1_zext:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 store i1 %arg0, ptr addrspace(1) undef
 ret void
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 0cc10512af5c..8352376a9c13 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -112,8 +112,6 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add i64 %a, %b
@@ -212,8 +210,6 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %add = add i64 20015998343286, %a
@@ -304,8 +300,6 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -397,8 +391,6 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -494,8 +486,6 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
 %val = extractvalue { i32, i1 } %uadd, 0
@@ -616,8 +606,6 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
 %val = extractvalue { i32, i1 } %uadd, 0
@@ -753,8 +741,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
 %val = extractvalue { i64, i1 } %uadd, 0
@@ -888,8 +874,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
@@ -1003,8 +987,6 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %sub = sub i64 %a, %b
@@ -1103,8 +1085,6 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %sub = sub i64 20015998343286, %a
@@ -1195,8 +1175,6 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0
 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1288,8 +1266,6 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1386,8 +1362,6 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
 %val = extractvalue { i32, i1 } %usub, 0
@@ -1508,8 +1482,6 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
 %val = extractvalue { i32, i1 } %usub, 0
@@ -1645,8 +1617,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
 %val = extractvalue { i64, i1 } %usub, 0
@@ -1780,8 +1750,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
@@ -2940,8 +2908,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT: .LBB16_3:
 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ; GFX11-NEXT: .LBB16_4:
 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 3fc5d0d4b279..78f61ad906ce 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -581,8 +581,6 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %loc = alloca [3 x i16], align 2, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index bafbc9486a1c..b17e1a080741 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -60,8 +60,6 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -143,8 +141,6 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -214,8 +210,6 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -289,8 +283,6 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT: v_floor_f32_e32 v1, v1
 ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -364,8 +356,6 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -433,8 +423,6 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -505,8 +493,6 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -577,8 +563,6 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -651,8 +635,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
 ; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x float>, ptr addrspace(1) %aptr, i32 %tid
@@ -721,8 +703,6 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -802,8 +782,6 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_add_f32_e32 v1, v2, v1
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -886,8 +864,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -967,8 +943,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1060,8 +1034,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
 ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1146,8 +1118,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
 ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1234,8 +1204,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
 ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1320,8 +1288,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
 ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1404,8 +1370,6 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1491,8 +1455,6 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1579,8 +1541,6 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 94482d7f0d80..311feafe3f43 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -61,8 +61,6 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32:
@@ -76,8 +74,6 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -146,8 +142,6 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neg_f32:
@@ -161,8 +155,6 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -232,8 +224,6 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negabs_f32:
@@ -247,8 +237,6 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -327,8 +315,6 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
 ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negzero_f32:
@@ -343,8 +329,6 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT: v_add_f32_e32 v1, 0.5, v1
 ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -423,8 +407,6 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
 ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32:
@@ -439,8 +421,6 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
 ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -527,8 +507,6 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_multi_use_max_f32:
@@ -548,8 +526,6 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -620,8 +596,6 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f16_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f16:
@@ -635,8 +609,6 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f16_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -706,8 +678,6 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neg_f16:
@@ -721,8 +691,6 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 clamp
 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -793,8 +761,6 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negabs_f16:
@@ -808,8 +774,6 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -881,8 +845,6 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f64:
@@ -896,8 +858,6 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -966,8 +926,6 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neg_f64:
@@ -981,8 +939,6 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -1052,8 +1008,6 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negabs_f64:
@@ -1067,8 +1021,6 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -1143,8 +1095,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, 0x80000000, 1.0, v1
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_aby_negzero_f32:
@@ -1158,8 +1108,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_med3_num_f32 v1, 0x80000000, 1.0, v1
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1226,8 +1174,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_aby_f32:
@@ -1241,8 +1187,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1309,8 +1253,6 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_bay_f32:
@@ -1324,8 +1266,6 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1392,8 +1332,6 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_yab_f32:
@@ -1407,8 +1345,6 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1475,8 +1411,6 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_yba_f32:
@@ -1490,8 +1424,6 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1558,8 +1490,6 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_ayb_f32:
@@ -1573,8 +1503,6 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1641,8 +1569,6 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_bya_f32:
@@ -1656,8 +1582,6 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1710,8 +1634,6 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #
 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constants_to_one_f32:
@@ -1722,8 +1644,6 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1773,8 +1693,6 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out)
 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constants_to_zero_f32:
@@ -1785,8 +1703,6 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out)
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1837,8 +1753,6 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out)
 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_preserve_f32:
@@ -1849,8 +1763,6 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out)
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1901,8 +1813,6 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1)
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32:
@@ -1913,8 +1823,6 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1)
 ; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1964,8 +1872,6 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_qnan_f32:
@@ -1976,8 +1882,6 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -2027,8 +1931,6 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_snan_f32:
@@ -2039,8 +1941,6 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -2113,8 +2013,6 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
 ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32_no_dx10_clamp:
@@ -2128,8 +2026,6 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2199,8 +2095,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32_snan_dx10clamp:
@@ -2214,8 +2108,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2289,8 +2181,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
 ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp:
@@ -2304,8 +2194,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2378,8 +2266,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
@@ -2393,8 +2279,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2464,8 +2348,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
@@ -2479,8 +2361,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2547,8 +2427,6 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
@@ -2562,8 +2440,6 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr
addrspace(1) %aptr, i32 %tid @@ -2630,8 +2506,6 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: @@ -2645,8 +2519,6 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -2713,8 +2585,6 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: @@ -2728,8 +2598,6 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -2796,8 +2664,6 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: @@ -2811,8 +2677,6 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -2879,8 +2743,6 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: @@ -2894,8 +2756,6 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -2948,8 +2808,6 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: @@ -2960,8 +2818,6 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -3012,8 +2868,6 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: @@ -3024,8 +2878,6 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -3098,8 +2950,6 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_v2f16: @@ -3113,8 +2963,6 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3200,8 +3048,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_v2f16_undef_elt: @@ -3215,8 +3061,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3302,8 +3146,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_v2f16_not_zero: @@ -3320,8 +3162,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3406,8 +3246,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_v2f16_not_one: @@ -3424,8 +3262,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3503,8 +3339,6 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_neg_v2f16: @@ -3518,8 +3352,6 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3600,8 +3432,6 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_negabs_v2f16: @@ -3616,8 +3446,6 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3698,8 +3526,6 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_neglo_v2f16: @@ -3713,8 +3539,6 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; 
GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3794,8 +3618,6 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_neghi_v2f16: @@ -3809,8 +3631,6 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3890,8 +3710,6 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_v2f16_shuffle: @@ -3905,8 +3723,6 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -3993,8 +3809,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0: @@ -4008,8 +3822,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -4095,8 +3907,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1: @@ -4110,8 +3920,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid @@ -4191,8 +3999,6 @@ define 
amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_clamp_diff_source_f32: @@ -4208,8 +4014,6 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; GFX12-NEXT: s_max_num_f32 s2, s2, s3 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] offset:12 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm { %gep1 = getelementptr float, ptr addrspace(1) %aptr, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index bada3d904fbe..fb3de211eaeb 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -318,8 +318,6 @@ define amdgpu_ps void @cluster_image_load(<8 x i32> inreg %src, <8 x i32> inreg ; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v5, v5, v9 ; GFX11-NEXT: v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v3, v3, v7 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %x1 = add i32 %x, 1 @@ -374,8 +372,6 @@ define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> in ; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 ; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0) @@ -464,8 +460,6 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre ; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 ; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %s = sitofp i32 %x to float diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll index a33b6cb5cf40..cc29152b3602 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll @@ -11,8 +11,6 @@ define amdgpu_vs void @fcmp_f32_olt_to_ogt(ptr addrspace(1) inreg %out, float in ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f32_olt_to_ogt: @@ -24,8 +22,6 @@ define amdgpu_vs void @fcmp_f32_olt_to_ogt(ptr addrspace(1) inreg %out, float in ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp olt float 2.0, %a @@ -43,8 +39,6 @@ define amdgpu_vs void 
@fcmp_f32_ogt_to_olt(ptr addrspace(1) inreg %out, float in ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f32_ogt_to_olt: @@ -56,8 +50,6 @@ define amdgpu_vs void @fcmp_f32_ogt_to_olt(ptr addrspace(1) inreg %out, float in ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ogt float 2.0, %a @@ -75,8 +67,6 @@ define amdgpu_vs void @fcmp_f32_ole_to_oge(ptr addrspace(1) inreg %out, float in ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f32_ole_to_oge: @@ -88,8 +78,6 @@ define amdgpu_vs void @fcmp_f32_ole_to_oge(ptr addrspace(1) inreg %out, float in ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ole float 2.0, %a @@ -107,8 +95,6 @@ define amdgpu_vs void @fcmp_f32_oge_to_ole(ptr addrspace(1) inreg %out, float in ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f32_oge_to_ole: @@ -120,8 +106,6 @@ define amdgpu_vs void @fcmp_f32_oge_to_ole(ptr addrspace(1) inreg %out, float in ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp oge float 2.0, %a @@ -139,8 +123,6 @@ define amdgpu_vs void @fcmp_f32_ult_to_ugt(ptr addrspace(1) inreg %out, float in ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f32_ult_to_ugt: @@ -152,8 +134,6 @@ define amdgpu_vs void @fcmp_f32_ult_to_ugt(ptr addrspace(1) inreg %out, float in ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ult float 2.0, %a @@ -171,8 +151,6 @@ define amdgpu_vs void @fcmp_f32_ugt_to_ult(ptr addrspace(1) inreg %out, float in ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f32_ugt_to_ult: @@ -184,8 +162,6 @@ define amdgpu_vs void @fcmp_f32_ugt_to_ult(ptr addrspace(1) inreg %out, float in ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: 
%0 = fcmp ugt float 2.0, %a @@ -203,8 +179,6 @@ define amdgpu_vs void @fcmp_f32_ule_to_uge(ptr addrspace(1) inreg %out, float in ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f32_ule_to_uge: @@ -216,8 +190,6 @@ define amdgpu_vs void @fcmp_f32_ule_to_uge(ptr addrspace(1) inreg %out, float in ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ule float 2.0, %a @@ -235,8 +207,6 @@ define amdgpu_vs void @fcmp_f32_uge_to_ule(ptr addrspace(1) inreg %out, float in ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f32_uge_to_ule: @@ -248,8 +218,6 @@ define amdgpu_vs void @fcmp_f32_uge_to_ule(ptr addrspace(1) inreg %out, float in ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp uge float 2.0, %a @@ -267,8 +235,6 @@ define amdgpu_vs void @fcmp_f16_olt_to_ogt(ptr addrspace(1) inreg %out, half inr ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f16_olt_to_ogt: @@ -280,8 +246,6 @@ define amdgpu_vs void @fcmp_f16_olt_to_ogt(ptr addrspace(1) inreg %out, half inr ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp olt half 2.0, %a @@ -299,8 +263,6 @@ define amdgpu_vs void @fcmp_f16_ogt_to_olt(ptr addrspace(1) inreg %out, half inr ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f16_ogt_to_olt: @@ -312,8 +274,6 @@ define amdgpu_vs void @fcmp_f16_ogt_to_olt(ptr addrspace(1) inreg %out, half inr ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ogt half 2.0, %a @@ -331,8 +291,6 @@ define amdgpu_vs void @fcmp_f16_ole_to_oge(ptr addrspace(1) inreg %out, half inr ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f16_ole_to_oge: @@ -344,8 +302,6 @@ define amdgpu_vs void @fcmp_f16_ole_to_oge(ptr addrspace(1) inreg %out, half inr ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; 
GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ole half 2.0, %a @@ -363,8 +319,6 @@ define amdgpu_vs void @fcmp_f16_oge_to_ole(ptr addrspace(1) inreg %out, half inr ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f16_oge_to_ole: @@ -376,8 +330,6 @@ define amdgpu_vs void @fcmp_f16_oge_to_ole(ptr addrspace(1) inreg %out, half inr ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp oge half 2.0, %a @@ -395,8 +347,6 @@ define amdgpu_vs void @fcmp_f16_ult_to_ugt(ptr addrspace(1) inreg %out, half inr ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f16_ult_to_ugt: @@ -408,8 +358,6 @@ define amdgpu_vs void @fcmp_f16_ult_to_ugt(ptr addrspace(1) inreg %out, half inr ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ult half 2.0, %a @@ -427,8 +375,6 @@ define amdgpu_vs void @fcmp_f16_ugt_to_ult(ptr addrspace(1) inreg %out, half inr ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_f16_ugt_to_ult: @@ -440,8 +386,6 @@ define amdgpu_vs void @fcmp_f16_ugt_to_ult(ptr addrspace(1) inreg %out, half inr ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ugt half 2.0, %a @@ -459,8 +403,6 @@ define amdgpu_vs void @fcmp_ule_to_uge(ptr addrspace(1) inreg %out, half inreg % ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_ule_to_uge: @@ -472,8 +414,6 @@ define amdgpu_vs void @fcmp_ule_to_uge(ptr addrspace(1) inreg %out, half inreg % ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ule half 2.0, %a @@ -491,8 +431,6 @@ define amdgpu_vs void @fcmp_uge_to_ule(ptr addrspace(1) inreg %out, half inreg % ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: fcmp_uge_to_ule: @@ -504,8 +442,6 @@ define amdgpu_vs void @fcmp_uge_to_ule(ptr addrspace(1) inreg %out, half inreg % ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; 
GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp uge half 2.0, %a diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 93e14a205f05..a7522ef761b8 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -96,8 +96,6 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX11-NEXT: s_min_u32 s2, s2, 32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, ptr addrspace(1) %out, align 4 @@ -200,8 +198,6 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -322,8 +318,6 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -471,8 +465,6 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -591,8 +583,6 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone @@ -686,8 +676,6 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX11-NEXT: s_min_u32 s0, s0, 64 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: global_store_b64 v1, v[0:1], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, ptr addrspace(1) %out @@ -771,8 +759,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; GFX11-NEXT: s_min_u32 s2, s2, 64 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) %trunc = trunc i64 %ctlz to i32 @@ -897,8 +883,6 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; 
GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1026,8 +1010,6 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1135,8 +1117,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1244,8 +1224,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1369,8 +1347,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1493,8 +1469,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1607,8 +1581,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1735,8 +1707,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone @@ -1855,8 +1825,6 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index f3f749b5c054..850e701513fd 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -962,8 +962,6 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid @@ -1050,8 +1048,6 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid @@ -1143,8 +1139,6 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid @@ -1241,8 +1235,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid @@ -1380,8 +1372,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid @@ -1553,8 +1543,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] ; GFX11-NEXT: global_store_b32 v6, v4, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid @@ -1748,8 +1736,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: global_store_b32 v4, v5, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = 
getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x @@ -1937,8 +1923,6 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid @@ -2061,8 +2045,6 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v10, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v10, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid @@ -2148,8 +2130,6 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid @@ -2232,8 +2212,6 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid @@ -2312,8 +2290,6 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid @@ -2448,8 +2424,6 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid @@ -2531,8 +2505,6 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid @@ -2614,8 +2586,6 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid @@ -2698,8 +2668,6 @@ define 
amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid @@ -2782,8 +2750,6 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid @@ -2879,8 +2845,6 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -3021,8 +2985,6 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr { ; GFX11-NEXT: global_store_b8 v[0:1], v3, off ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: global_store_b8 v[0:1], v1, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: br label %for.body.i diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 5fae60a7acac..f139943ff2bc 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -49,8 +49,6 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x i16> undef, i16 0, i32 0 %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1 @@ -134,8 +132,6 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 %vec = insertelement <2 x i16> %tmp, i16 0, i32 1 @@ -219,8 +215,6 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x half> undef, half %a, i32 0 %vec = insertelement <2 x half> %tmp, half 0.0, i32 1 @@ -402,8 +396,6 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-NEXT: s_endpgm %shift = lshr i32 %b, 16 %tr = trunc i32 %shift to i16 @@ -492,8 +484,6 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %shift_a = lshr i32 %a, 16 %tr_a = trunc i32 %shift_a to i16 @@ -730,8 +720,6 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; GFX11-NEXT: ds_load_u16_d16 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %load = load i16, ptr addrspace(3) %in diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 41a9d7999e80..352d55073d67 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -120,8 +120,6 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -197,8 +195,6 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 -1, %x.i @@ -602,8 +598,6 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; GFX11-NEXT: v_div_fmas_f32 v5, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index 54fb1dc5c052..d6bde7980284 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -194,8 +194,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 10 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: %i1 = add i32 %i, %i diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index f4ec16db55d6..858eaace8dcb 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -47,8 +47,6 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:20 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %p0 = extractelement <2 x half> %vec, i32 0 @@ -107,8 +105,6 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %elt = extractelement <2 x half> %vec, i32 %idx @@ -178,8 +174,6 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -234,8 +228,6 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 @@ -290,8 +282,6 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 %idx %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 @@ -347,8 +337,6 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -422,8 +410,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v3, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -486,8 +472,6 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load <16 x half>, ptr addrspace(4) %ptr %elt0 = extractelement <16 x half> %load, i32 0 @@ -547,8 +531,6 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load <16 x half>, ptr addrspace(4) %ptr %elt2 = extractelement <16 x half> %load, i32 2 @@ -696,8 +678,6 @@ define amdgpu_kernel void 
@v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -952,8 +932,6 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 21799ab79b83..986f27b19dba 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -54,8 +54,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc= bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) @@ -109,8 +107,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in) store half %fabs, ptr addrspace(1) %out @@ -163,8 +159,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) store <2 x half> %fabs, ptr addrspace(1) %out @@ -220,8 +214,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) store <4 x half> %fabs, ptr addrspace(1) %out @@ -281,8 +273,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mul_f16_e64 v1, |s4|, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in0) %fmul = fmul half %fabs, %in1 @@ -341,8 +331,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = 
getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -399,8 +387,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc) @@ -481,8 +467,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -571,8 +555,6 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -660,8 +642,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -742,8 +722,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index 1094b768f1bd..e3c9e8ccdc39 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -80,8 +80,6 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fadd_f16: @@ -102,8 +100,6 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16: @@ -128,8 +124,6 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-FAKE16-SDAG-NEXT: s_nop 0 -; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-FAKE16-SDAG-NEXT: s_endpgm ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16: @@ -150,8 +144,6 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-GISEL-NEXT: s_nop 0 -; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-GISEL-NEXT: s_endpgm ; GFX11-LABEL: fadd_f16: ; GFX11: ; %bb.0: ; %entry @@ -246,8 +238,6 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fadd_f16_imm_a: @@ -262,8 +252,6 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a: @@ -282,8 +270,6 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-SDAG-NEXT: s_nop 0 -; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-SDAG-NEXT: s_endpgm ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a: @@ -298,8 +284,6 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-FAKE16-GISEL-NEXT: s_nop 0 -; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-GISEL-NEXT: s_endpgm ; GFX11-LABEL: fadd_f16_imm_a: ; GFX11: ; %bb.0: ; %entry @@ -386,8 +370,6 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fadd_f16_imm_b: @@ -402,8 +384,6 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b: @@ -422,8 +402,6 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 2.0, v0 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-SDAG-NEXT: s_nop 0 -; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-SDAG-NEXT: s_endpgm ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b: @@ -438,8 +416,6 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 2.0, v0 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-FAKE16-GISEL-NEXT: s_nop 0 -; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-GISEL-NEXT: s_endpgm ; GFX11-LABEL: fadd_f16_imm_b: ; 
GFX11: ; %bb.0: ; %entry @@ -550,8 +526,6 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_pk_add_f16 v0, v1, v0 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fadd_v2f16: @@ -571,8 +545,6 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v1, v0 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16: @@ -594,8 +566,6 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, v1, v0 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-FAKE16-SDAG-NEXT: s_nop 0 -; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-SDAG-NEXT: s_endpgm ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16: @@ -615,8 +585,6 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, v1, v0 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-GISEL-NEXT: s_nop 0 -; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-GISEL-NEXT: s_endpgm ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: ; %entry @@ -716,8 +684,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_a: @@ -733,8 +699,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a: @@ -752,8 +716,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-SDAG-NEXT: s_nop 0 -; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-SDAG-NEXT: s_endpgm ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a: @@ -769,8 +731,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-FAKE16-GISEL-NEXT: s_nop 0 -; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-GISEL-NEXT: s_endpgm ; GFX11-LABEL: fadd_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry @@ -863,8 +823,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_b: @@ -880,8 +838,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; GFX11-GISEL-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b: @@ -899,8 +855,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-SDAG-NEXT: s_nop 0 -; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-SDAG-NEXT: s_endpgm ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b: @@ -916,8 +870,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-FAKE16-GISEL-NEXT: s_nop 0 -; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-GISEL-NEXT: s_endpgm ; GFX11-LABEL: fadd_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 7252c69cb1cf..dddd649888af 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -121,8 +121,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_2xi16_align2: @@ -131,8 +129,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 store i16 1, ptr addrspace(1) %r, align 2 @@ -278,8 +274,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_2xi16_align1: @@ -288,8 +282,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 store i16 1, ptr addrspace(1) %r, align 1 @@ -400,8 +392,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_2xi16_align4: @@ -410,8 +400,6 @@ define amdgpu_kernel void 
@global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 store i16 1, ptr addrspace(1) %r, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 7af972b96ec6..26580618794d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -53,8 +53,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half undef) store half %canonicalized, ptr addrspace(1) %out @@ -107,8 +105,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %canonicalized = call half @llvm.canonicalize.f16(half %val) @@ -160,8 +156,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, s4, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) @@ -254,8 +248,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) @@ -310,8 +302,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) @@ -367,8 +357,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fneg = fneg half %val @@ -423,8 +411,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fneg = fneg half %val @@ -479,8 +465,6 @@ define amdgpu_kernel void 
@v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) @@ -525,8 +509,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0.0) store half %canonicalized, ptr addrspace(1) %out @@ -569,8 +551,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half -0.0) store half %canonicalized, ptr addrspace(1) %out @@ -613,8 +593,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 1.0) store half %canonicalized, ptr addrspace(1) %out @@ -657,8 +635,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half -1.0) store half %canonicalized, ptr addrspace(1) %out @@ -701,8 +677,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 16.0) store half %canonicalized, ptr addrspace(1) %out @@ -745,8 +719,6 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, ptr addrspace(1) %out @@ -789,8 +761,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, ptr addrspace(1) %out @@ -833,8 +803,6 @@ define 
amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, ptr addrspace(1) %out @@ -877,8 +845,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, ptr addrspace(1) %out @@ -921,8 +887,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) store half %canonicalized, ptr addrspace(1) %out @@ -965,8 +929,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) store half %canonicalized, ptr addrspace(1) %out @@ -1009,8 +971,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) store half %canonicalized, ptr addrspace(1) %out @@ -1053,8 +1013,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) store half %canonicalized, ptr addrspace(1) %out @@ -1097,8 +1055,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) store half %canonicalized, ptr addrspace(1) %out @@ -1141,8 +1097,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) store half %canonicalized, ptr addrspace(1) %out @@ -1185,8 +1139,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) store half %canonicalized, ptr addrspace(1) %out @@ -1258,8 +1210,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid @@ -1336,8 +1286,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid @@ -1416,8 +1364,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid @@ -1495,8 +1441,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid @@ -1560,8 +1504,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = bitcast i32 %val.arg to <2 x half> %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) @@ -1604,8 +1546,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) store <2 x half> %canonicalized, ptr addrspace(1) %out @@ -1648,8 +1588,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1692,8 +1630,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1736,8 +1672,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1780,8 +1714,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1824,8 +1756,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1868,8 +1798,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1912,8 +1840,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1956,8 +1882,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2000,8 +1924,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2044,8 +1966,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2088,8 +2008,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2132,8 +2050,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2176,8 +2092,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2220,8 +2134,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2264,8 +2176,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
 store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2387,8 +2297,6 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out
 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-;
GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) store <2 x half> %canonicalized, ptr addrspace(1) %out @@ -2694,8 +2602,6 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) store <4 x half> %canonicalized, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index f0ce96af9064..adf1635b2914 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -53,8 +53,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_var_f32: @@ -66,8 +64,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out %canonicalized = call float @llvm.canonicalize.f32(float %val) @@ -117,8 +113,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_test_canonicalize_var_f32: @@ -128,8 +122,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, ptr addrspace(1) %out @@ -169,8 +161,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f32: @@ -182,8 +172,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out %val.fabs = call float @llvm.fabs.f32(float %val) @@ -225,8 +213,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32: @@ -238,8 +224,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out %val.fabs = call float @llvm.fabs.f32(float %val) @@ -282,8 +266,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f32: @@ -295,8 +277,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out %val.fneg = fneg float %val @@ -330,8 +310,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_undef_f32: @@ -340,8 +318,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float undef) store float %canonicalized, ptr addrspace(1) %out @@ -373,8 +349,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_p0_f32: @@ -383,8 +357,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 0.0) store float %canonicalized, ptr addrspace(1) %out @@ -418,8 +390,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_n0_f32: @@ -429,8 +399,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call 
float @llvm.canonicalize.f32(float -0.0) store float %canonicalized, ptr addrspace(1) %out @@ -463,8 +431,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_p1_f32: @@ -473,8 +439,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 1.0) store float %canonicalized, ptr addrspace(1) %out @@ -507,8 +471,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_n1_f32: @@ -517,8 +479,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float -1.0) store float %canonicalized, ptr addrspace(1) %out @@ -551,8 +511,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_literal_f32: @@ -561,8 +519,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 16.0) store float %canonicalized, ptr addrspace(1) %out @@ -594,8 +550,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: @@ -604,8 +558,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -641,8 +593,6 @@ define amdgpu_kernel void 
@test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: @@ -652,8 +602,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -689,8 +637,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: @@ -700,8 +646,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -737,8 +681,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: @@ -748,8 +690,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -782,8 +722,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32: @@ -792,8 +730,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -827,8 +763,6 @@ define amdgpu_kernel void 
@test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: @@ -838,8 +772,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -872,8 +804,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32: @@ -882,8 +812,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -916,8 +844,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f32: @@ -926,8 +852,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000) store float %canonicalized, ptr addrspace(1) %out @@ -960,8 +884,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: @@ -970,8 +892,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -1004,8 +924,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; 
GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: @@ -1014,8 +932,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -1048,8 +964,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32: @@ -1058,8 +972,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -1092,8 +1004,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32: @@ -1102,8 +1012,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -1136,8 +1044,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32: @@ -1146,8 +1052,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -1180,8 +1084,6 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32: @@ -1190,8 +1092,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) store float %canonicalized, ptr addrspace(1) %out @@ -1231,8 +1131,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_var_f64: @@ -1244,8 +1142,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out %canonicalized = call double @llvm.canonicalize.f64(double %val) @@ -1290,8 +1186,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_test_canonicalize_var_f64: @@ -1301,8 +1195,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, ptr addrspace(1) %out @@ -1342,8 +1234,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], |v[0:1]|, |v[0:1]| ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f64: @@ -1355,8 +1245,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], |v[0:1]|, |v[0:1]| ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out %val.fabs = call double @llvm.fabs.f64(double %val) @@ -1398,8 +1286,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| ; GFX11-NEXT: global_store_b64 v2, 
v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64: @@ -1411,8 +1297,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out %val.fabs = call double @llvm.fabs.f64(double %val) @@ -1455,8 +1339,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f64: @@ -1468,8 +1350,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out %val.fneg = fneg double %val @@ -1507,8 +1387,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_p0_f64: @@ -1519,8 +1397,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, ptr addrspace(1) %out @@ -1555,8 +1431,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_n0_f64: @@ -1566,8 +1440,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double -0.0) store double %canonicalized, ptr addrspace(1) %out @@ -1601,8 +1473,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_p1_f64: @@ -1611,8 +1481,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, 0x3ff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 1.0) store double %canonicalized, ptr addrspace(1) %out @@ -1646,8 +1514,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_n1_f64: @@ -1656,8 +1522,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double -1.0) store double %canonicalized, ptr addrspace(1) %out @@ -1691,8 +1555,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_literal_f64: @@ -1701,8 +1563,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 16.0) store double %canonicalized, ptr addrspace(1) %out @@ -1738,8 +1598,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: @@ -1750,8 +1608,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -1787,8 +1643,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64: @@ -1798,8 +1652,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -1834,8 +1686,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: @@ -1845,8 +1695,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -1882,8 +1730,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64: @@ -1893,8 +1739,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -1928,8 +1772,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f64: @@ -1938,8 +1780,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) store double %canonicalized, ptr addrspace(1) %out @@ -1973,8 +1813,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: @@ -1983,8 +1821,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -2018,8 +1854,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: @@ -2028,8 +1862,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -2063,8 +1895,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64: @@ -2073,8 +1903,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -2108,8 +1936,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64: @@ -2118,8 +1944,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -2153,8 +1977,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64: @@ -2163,8 +1985,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -2198,8 +2018,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64: @@ -2208,8 +2026,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) store double %canonicalized, ptr addrspace(1) %out @@ -2273,8 +2089,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_canonicalize_value_f64_flush: @@ -2288,8 +2102,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id @@ -2357,8 +2169,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_canonicalize_value_f32_flush: @@ -2372,8 +2182,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id @@ -2442,8 +2250,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_canonicalize_value_f16_flush: @@ -2457,8 +2263,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 ; GFX12-NEXT: global_store_b16 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm 
%id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id @@ -2536,8 +2340,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_canonicalize_value_v2f16_flush: @@ -2551,8 +2353,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id @@ -2620,8 +2420,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_canonicalize_value_f64_denorm: @@ -2635,8 +2433,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id @@ -2704,8 +2500,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_canonicalize_value_f32_denorm: @@ -2719,8 +2513,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id @@ -2790,8 +2582,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_canonicalize_value_f16_denorm: @@ -2805,8 +2595,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 ; GFX12-NEXT: global_store_b16 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id @@ -2884,8 +2672,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: test_canonicalize_value_v2f16_denorm: @@ -2899,8 +2685,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id @@ -2972,8 +2756,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_canonicalize_var_v2f64: @@ -2989,8 +2771,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x double>, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 845b25a8f61b..59e52a86a2f5 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -77,8 +77,6 @@ define amdgpu_kernel void @fcmp_f16_lt( ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -167,8 +165,6 @@ define amdgpu_kernel void @fcmp_f16_lt_abs( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -258,8 +254,6 @@ define amdgpu_kernel void @fcmp_f16_eq( ; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -347,8 +341,6 @@ define amdgpu_kernel void @fcmp_f16_le( ; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -436,8 +428,6 @@ define amdgpu_kernel void @fcmp_f16_gt( ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -525,8 +515,6 @@ define amdgpu_kernel void @fcmp_f16_lg( ; GFX11-NEXT: 
v_cmp_lg_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -614,8 +602,6 @@ define amdgpu_kernel void @fcmp_f16_ge( ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -703,8 +689,6 @@ define amdgpu_kernel void @fcmp_f16_o( ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -792,8 +776,6 @@ define amdgpu_kernel void @fcmp_f16_u( ; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -881,8 +863,6 @@ define amdgpu_kernel void @fcmp_f16_nge( ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -970,8 +950,6 @@ define amdgpu_kernel void @fcmp_f16_nlg( ; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1059,8 +1037,6 @@ define amdgpu_kernel void @fcmp_f16_ngt( ; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1148,8 +1124,6 @@ define amdgpu_kernel void @fcmp_f16_nle( ; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1237,8 +1211,6 @@ define amdgpu_kernel void @fcmp_f16_neq( ; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1326,8 +1298,6 @@ define amdgpu_kernel void @fcmp_f16_nlt( ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1430,8 +1400,6 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 
v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1535,8 +1503,6 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1639,8 +1605,6 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1743,8 +1707,6 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1848,8 +1810,6 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1953,8 +1913,6 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -2058,8 +2016,6 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -2163,8 +2119,6 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -2267,8 +2221,6 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -2371,8 +2323,6 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -2476,8 +2426,6 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 
v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -2580,8 +2528,6 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -2684,8 +2630,6 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -2788,8 +2732,6 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index e6f9889440f0..13367d3bb36e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -71,8 +71,6 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out = call half @llvm.copysign.f16(half %mag, half %sign) store half %out, ptr addrspace(1) %arg_out @@ -125,8 +123,6 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 0.0) store half %result, ptr addrspace(1) %out, align 4 @@ -179,8 +175,6 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 1.0) store half %result, ptr addrspace(1) %out, align 4 @@ -233,8 +227,6 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 10.0) store half %result, ptr addrspace(1) %out, align 4 @@ -287,8 +279,6 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: 
global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half -1.0) store half %result, ptr addrspace(1) %out, align 4 @@ -341,8 +331,6 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half -10.0) store half %result, ptr addrspace(1) %out, align 4 @@ -398,8 +386,6 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 0.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 @@ -459,8 +445,6 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 1.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 @@ -520,8 +504,6 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; GFX11-NEXT: s_or_b32 s2, s2, 0x4900 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 10.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 @@ -580,8 +562,6 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half -1.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 @@ -641,8 +621,6 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; GFX11-NEXT: s_or_b32 s2, s2, 0x4900 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half -10.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 @@ -915,8 +893,6 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid @@ -1019,8 +995,6 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; 
GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid @@ -1120,8 +1094,6 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid @@ -1223,8 +1195,6 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid @@ -1326,8 +1296,6 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid @@ -1426,8 +1394,6 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid @@ -1531,8 +1497,6 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 ; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid @@ -1787,8 +1751,6 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 ; GFX11-NEXT: global_store_b16 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) @@ -1877,8 +1839,6 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign) store <2 x half> %out, ptr addrspace(1) %arg_out @@ -1989,8 +1949,6 @@ 
define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 ; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign) store <3 x half> %out, ptr addrspace(1) %arg_out @@ -2118,8 +2076,6 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) store <4 x half> %out, ptr addrspace(1) %arg_out diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 542d67486e75..43cf26c422a7 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -40,8 +40,6 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float %sign) store float %result, ptr addrspace(1) %out, align 4 @@ -83,8 +81,6 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 0.0) store float %result, ptr addrspace(1) %out, align 4 @@ -126,8 +122,6 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 1.0) store float %result, ptr addrspace(1) %out, align 4 @@ -169,8 +163,6 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 10.0) store float %result, ptr addrspace(1) %out, align 4 @@ -212,8 +204,6 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float -1.0) store float %result, ptr addrspace(1) %out, align 4 @@ -255,8 +245,6 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr 
addrspace(1) %out, floa ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float -10.0) store float %result, ptr addrspace(1) %out, align 4 @@ -298,8 +286,6 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 0.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 @@ -345,8 +331,6 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 1.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 @@ -391,8 +375,6 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 10.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 @@ -437,8 +419,6 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float -1.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 @@ -483,8 +463,6 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float -10.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 @@ -538,8 +516,6 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, ptr addrspace(1) %out, align 8 @@ -603,8 +579,6 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3 ; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <3 x 
float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
 store <3 x float> %result, ptr addrspace(1) %out, align 16
@@ -674,8 +648,6 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4
 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5
 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
 store <4 x float> %result, ptr addrspace(1) %out, align 16
@@ -946,8 +918,6 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %sign.trunc = fptrunc double %sign to float
 %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
@@ -991,8 +961,6 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o
 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %sign.trunc = fptrunc double %sign to float
 %result = call float @llvm.copysign.f32(float 1.0, float %sign.trunc)
@@ -1038,8 +1006,6 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out,
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %sign.ext = fpext half %sign to float
 %result = call float @llvm.copysign.f32(float %mag, float %sign.ext)
@@ -1089,8 +1055,6 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %sign.ext = fpext half %sign to float
 %result = call float @llvm.copysign.f32(float 1.0, float %sign.ext)
@@ -1137,8 +1101,6 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %sign.ext = fpext bfloat %sign to float
 %result = call float @llvm.copysign.f32(float %mag, float %sign.ext)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 4300faa02742..1bcc41320071 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double %mag, double %sign)
 store double %result, ptr addrspace(1) %out, align 8
@@ -100,8 +98,6 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double %mag, double 0.0)
 store double %result, ptr addrspace(1) %out, align 8
@@ -146,8 +142,6 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double %mag, double 1.0)
 store double %result, ptr addrspace(1) %out, align 8
@@ -192,8 +186,6 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double %mag, double 10.0)
 store double %result, ptr addrspace(1) %out, align 8
@@ -238,8 +230,6 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double %mag, double -1.0)
 store double %result, ptr addrspace(1) %out, align 8
@@ -284,8 +274,6 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double %mag, double -10.0)
 store double %result, ptr addrspace(1) %out, align 8
@@ -338,8 +326,6 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i
 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %sign.ext = fpext float %sign to double
 %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
@@ -393,8 +379,6 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i
 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %sign.ext = fpext half %sign to double
 %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
@@ -437,8 +421,6 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double 0.0, double %sign)
 store double %result, ptr addrspace(1) %out, align 4
@@ -483,8 +465,6 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub
 ; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double 1.0, double %sign)
 store double %result, ptr addrspace(1) %out, align 4
@@ -529,8 +509,6 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou
 ; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double 10.0, double %sign)
 store double %result, ptr addrspace(1) %out, align 4
@@ -575,8 +553,6 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d
 ; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double -1.0, double %sign)
 store double %result, ptr addrspace(1) %out, align 4
@@ -621,8 +597,6 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out,
 ; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call double @llvm.copysign.f64(double -10.0, double %sign)
 store double %result, ptr addrspace(1) %out, align 4
@@ -682,8 +656,6 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v2
 ; GFX11-NEXT: v_mov_b32_e32 v2, s6
 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
 store <2 x double> %result, ptr addrspace(1) %out, align 16
@@ -760,8 +732,6 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
 store <3 x double> %result, ptr addrspace(1) %out, align 32
@@ -849,8 +819,6 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
 store <4 x double> %result, ptr addrspace(1) %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 0c6805e3eba5..d1676b13c129 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -161,8 +161,6 @@ define amdgpu_kernel void @v_fdiv_f16(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -262,8 +260,6 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -358,8 +354,6 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_rcp_f16_e64 v1, |v1|
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -457,8 +451,6 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -540,8 +532,6 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -636,8 +626,6 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -735,8 +723,6 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_rsq_f16_e32 v1, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -840,8 +826,6 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -950,8 +934,6 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1056,8 +1038,6 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1161,8 +1141,6 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1266,8 +1244,6 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1378,8 +1354,6 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2
 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1491,8 +1465,6 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2
 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1565,8 +1537,6 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %x = load half, ptr addrspace(1) undef
 %rcp = fdiv afn half %x, 2.0
@@ -1632,8 +1602,6 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %x = load half, ptr addrspace(1) undef
 %rcp = fdiv afn half %x, 10.0
@@ -1699,8 +1667,6 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %x = load half, ptr addrspace(1) undef
 %rcp = fdiv afn half %x, -10.0
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 93105e57a591..f34739c5ca25 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -157,8 +157,6 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_ninf:
@@ -310,8 +308,6 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_ieee:
@@ -395,8 +391,6 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ; GFX11-NEXT: v_mul_f32_e32 v1, s2, v1
 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_25ulp_f32:
@@ -511,8 +505,6 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_25ulp_ieee_f32:
@@ -575,8 +567,6 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_fast_ieee_f32:
@@ -639,8 +629,6 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_fast_math:
@@ -703,8 +691,6 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_ulp25_f32_fast_math:
@@ -868,8 +854,6 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_arcp_daz:
@@ -932,8 +916,6 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_arcp_ninf:
@@ -1188,8 +1170,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4
 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s6, s4
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32:
@@ -1266,8 +1246,6 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_ulp25_v2f32:
@@ -1344,8 +1322,6 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32_fast_math:
@@ -1422,8 +1398,6 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32_arcp_math:
@@ -1856,8 +1830,6 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v5, v6
 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s0
 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v4f32:
@@ -1964,8 +1936,6 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT: v_dual_mul_f32 v2, s2, v1 :: v_dual_mul_f32 v1, s1, v4
 ; GFX11-NEXT: v_mul_f32_e32 v0, s0, v5
 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[8:9]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v4f32_fast_math:
@@ -2072,8 +2042,6 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT: v_dual_mul_f32 v2, s2, v1 :: v_dual_mul_f32 v1, s1, v4
 ; GFX11-NEXT: v_mul_f32_e32 v0, s0, v5
 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[8:9]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v4f32_arcp_math:
@@ -2248,8 +2216,6 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
@@ -2396,8 +2362,6 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
index a78a8a2ed8fe..67cfe5248254 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
@@ -27,8 +27,6 @@ define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-NEXT: s_clause 0x1
 ; CHECK-NEXT: buffer_store_b128 v[5:8], v6, s[0:3], 0 idxen
 ; CHECK-NEXT: buffer_store_b128 v[1:4], v6, s[0:3], 0 idxen
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT: s_endpgm
 bb:
 %i = icmp eq i32 %arg, 0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index c5d4ef23070e..105174d7c9b3 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4918,8 +4918,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT: scratch_load_b32 v2, off, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_large_offset:
@@ -4927,8 +4925,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; GFX9-PAL-LABEL: sgpr_base_large_offset:
@@ -4977,8 +4973,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-PAL-NEXT: scratch_load_b32 v2, off, s0
 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-PAL-NEXT: s_nop 0
-; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT: s_endpgm
 ;
 ; GFX12-PAL-LABEL: sgpr_base_large_offset:
@@ -4986,8 +4980,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:65512
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-PAL-NEXT: s_nop 0
-; GFX12-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PAL-NEXT: s_endpgm
 entry:
 %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
@@ -5028,8 +5020,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT: scratch_load_b32 v2, v2, s0 offset:4072 glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_large_offset_split:
@@ -5039,8 +5029,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX12-NEXT: scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; GFX9-PAL-LABEL: sgpr_base_large_offset_split:
@@ -5093,8 +5081,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX11-PAL-NEXT: scratch_load_b32 v2, v2, s0 offset:4072 glc dlc
 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-PAL-NEXT: s_nop 0
-; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT: s_endpgm
 ;
 ; GFX12-PAL-LABEL: sgpr_base_large_offset_split:
@@ -5104,8 +5090,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX12-PAL-NEXT: scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-PAL-NEXT: s_nop 0
-; GFX12-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PAL-NEXT: s_endpgm
 entry:
 ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5)
@@ -5257,8 +5241,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_negative_offset:
@@ -5266,8 +5248,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; GFX9-PAL-LABEL: sgpr_base_negative_offset:
@@ -5314,8 +5294,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX11-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:-24
 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-PAL-NEXT: s_nop 0
-; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT: s_endpgm
 ;
 ; GFX12-PAL-LABEL: sgpr_base_negative_offset:
@@ -5323,8 +5301,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX12-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:-24
 ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-PAL-NEXT: s_nop 0
-; GFX12-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PAL-NEXT: s_endpgm
 entry:
 %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 4846e21fe836..c75521267ae7 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -122,8 +120,6 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -183,8 +179,6 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -239,8 +233,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -307,8 +299,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -368,8 +358,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -436,8 +424,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -497,8 +483,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -567,8 +551,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -642,8 +624,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -743,8 +723,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
 ; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
 ; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
@@ -768,8 +746,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -869,8 +845,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
 ; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
 ; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
@@ -894,8 +868,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
 ; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -987,8 +959,6 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y:
@@ -1004,8 +974,6 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1080,8 +1048,6 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one:
@@ -1097,8 +1063,6 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1173,8 +1137,6 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y:
@@ -1190,8 +1152,6 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1266,8 +1226,6 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone:
@@ -1283,8 +1241,6 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1359,8 +1315,6 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y:
@@ -1376,8 +1330,6 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1452,8 +1404,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x:
@@ -1469,8 +1419,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1545,8 +1493,6 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y:
@@ -1562,8 +1508,6 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1638,8 +1582,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x:
@@ -1655,8 +1597,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1731,8 +1671,6 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y:
@@ -1748,8 +1686,6 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1824,8 +1760,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one:
@@ -1841,8 +1775,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -1917,8 +1849,6 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y:
@@ -1934,8 +1864,6 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -2010,8 +1938,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone:
@@ -2027,8 +1953,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2) {
@@ -2124,8 +2048,6 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1
 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_interp:
@@ -2143,8 +2065,6 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2,
@@ -2208,8 +2128,6 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
 ; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_nop 0
-; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT: s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f64_interp:
@@ -2227,8 +2145,6 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
 ; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 ptr addrspace(1) %in1,
 ptr addrspace(1) %in2,
@@ -2275,8 +2191,6 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_fmac_f32_e32 v2, 2.0, v1
 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -2323,8 +2237,6 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_fmac_f32_e32 v2, -2.0, v1
 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -2380,8 +2292,6 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1
 ; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0
 ; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.0 = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index f86f5305e6ba..5caaa2c9550f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -123,8 +123,6 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_mov_b32 s9, s1
 ; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a = load volatile float, ptr addrspace(1) %aptr, align 4
 %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -255,8 +253,6 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_mov_b32 s9, s1
 ; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a = load volatile float, ptr addrspace(1) %aptr, align 4
 %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -394,8 +390,6 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_mov_b32 s9, s1
 ; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a = load volatile half, ptr addrspace(1) %aptr, align 2
 %b = load volatile half, ptr addrspace(1) %bptr, align 2
@@ -534,8 +528,6 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: s_mov_b32 s9, s1
 ; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a = load volatile half, ptr addrspace(1) %aptr, align 2
 %b = load volatile half, ptr addrspace(1) %bptr, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 04cd150d9317..84c3913ec93c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -273,8 +273,6 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT: s_wait_loadcnt 0x0
 ; GCN-NEXT: v_maximum_f32 v1, v1, v2
 ; GCN-NEXT: global_store_b32 v0, v1, s[4:5]
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT: s_endpgm
 %a = load volatile float, ptr addrspace(1) %aptr, align 4
 %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -297,8 +295,6 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT: s_wait_loadcnt 0x0
 ; GCN-NEXT: v_maximum_f16 v1, v1, v2
 ; GCN-NEXT: global_store_b16 v0, v1, s[4:5]
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT: s_endpgm
 %a = load volatile half, ptr addrspace(1) %aptr, align 4
 %b = load volatile half, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 6634d36122d0..58e864b496b3 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -105,8 +105,6 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -217,8 +215,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -330,8 +326,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -443,8 +437,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -561,8 +553,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -715,8 +705,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
@@ -735,8 +723,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -856,8 +842,6 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -963,8 +947,6 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1094,8 +1076,6 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
@@ -1114,8 +1094,6 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1
 ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1288,8 +1266,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
@@ -1309,8 +1285,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1482,8 +1456,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
@@ -1503,8 +1475,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1676,8 +1646,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
@@ -1697,8 +1665,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1873,8 +1839,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3|
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
@@ -1895,8 +1859,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2080,8 +2042,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3|
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
@@ -2103,8 +2063,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2284,8 +2242,6 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2443,8 +2399,6 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2597,8 +2551,6 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2751,8 +2703,6 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2917,8 +2867,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3071,8 +3019,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3243,8 +3189,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
@@ -3264,8 +3208,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3419,8 +3361,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3573,8 +3513,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3727,8 +3665,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3881,8 +3817,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4035,8 +3969,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4189,8 +4121,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4343,8 +4273,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4497,8 +4425,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4651,8 +4577,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4805,8 +4729,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4959,8 +4881,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5113,8 +5033,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5267,8 +5185,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5421,8 +5337,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5578,8 +5492,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5785,8 +5697,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5991,8 +5901,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 ; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
@@ -6016,8 +5924,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 ; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -6220,8 +6126,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -6408,8 +6312,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3
 ; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0:
@@ -6431,8 +6333,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
 ; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -6603,8 +6503,6 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@
-6780,8 +6678,6 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -6957,8 +6853,6 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -7134,8 +7028,6 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: @@ -7155,8 +7047,6 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -7347,8 +7237,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_minmax_f32 v1, -v1, v2, v3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: @@ -7370,8 +7258,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -7531,8 +7417,6 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -7659,8 +7543,6 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 ; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid @@ 
-7860,8 +7742,6 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f16 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid @@ -7988,8 +7868,6 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: two_non_inline_constant: @@ -8005,8 +7883,6 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0.5, v1 ; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, 0x41000000, v2 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -8144,8 +8020,6 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -8329,8 +8203,6 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use: @@ -8352,8 +8224,6 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 2025ddb07e83..714040405bf6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -123,8 +123,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -255,8 +253,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: 
s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -394,8 +390,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 @@ -534,8 +528,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 @@ -743,8 +735,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 @@ -890,8 +880,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 3271758f7129..0353fc4f2b91 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -273,8 +273,6 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_minimum_f32 v1, v1, v2 ; GCN-NEXT: global_store_b32 v0, v1, s[4:5] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -297,8 +295,6 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_minimum_f16 v1, v1, v2 ; GCN-NEXT: global_store_b16 v0, v1, s[4:5] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 4 %b = load volatile half, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index c60b9858abd8..57a960207180 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -59,8 +59,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0 ; GFX11-NEXT: v_fma_f32 v0, -v1, v0, 1.0 ; GFX11-NEXT: global_store_b32 v2, v0, 
s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a11 = fadd float %y, -1.0 %a12 = call float @llvm.fabs.f32(float %a11) @@ -128,8 +126,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %mul2 = fmul fast float %x, 2.0 @@ -183,8 +179,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) @@ -245,8 +239,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) @@ -296,8 +288,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %mul2 = fmul fast float %x, 2.0 @@ -346,8 +336,6 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %mul2 = fmul fast float %x, 2.0 @@ -449,8 +437,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16: @@ -473,8 +459,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half @@ -582,8 +566,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16: @@ -601,8 +583,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] offset:2 dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half @@ -703,8 +683,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16: @@ -722,8 +700,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] offset:2 dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half @@ -833,8 +809,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3] offset:2 dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: @@ -854,8 +828,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[2:3] offset:2 dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half @@ -909,8 +881,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half @@ -962,8 +932,6 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index 7b8384f317c6..e12c854f03c6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -76,8 +76,6 @@ define amdgpu_kernel void @fmul_f16( ; GFX11-NEXT: s_mov_b32 s9, s5 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: 
s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -163,8 +161,6 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { @@ -248,8 +244,6 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -362,8 +356,6 @@ define amdgpu_kernel void @fmul_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -459,8 +451,6 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { @@ -554,8 +544,6 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -685,8 +673,6 @@ define amdgpu_kernel void @fmul_v4f16( ; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1 ; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -797,8 +783,6 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 2b556a0be2b1..8298a925343b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -105,8 +105,6 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-LABEL: fmuladd_f16: @@ -121,8 +119,6 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 ; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { %r0 = load half, ptr addrspace(1) %in1 @@ -235,8 +231,6 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr 
addrspace(1) ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16: @@ -254,8 +248,6 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16: @@ -270,8 +262,6 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { %r0 = load half, ptr addrspace(1) %in1 @@ -369,8 +359,6 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16: @@ -385,8 +373,6 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 ; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { %r0 = load half, ptr addrspace(1) %in1 @@ -477,8 +463,6 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16: @@ -494,8 +478,6 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -589,8 +571,6 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16: @@ -606,8 +586,6 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; 
GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -715,8 +693,6 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16: @@ -734,8 +710,6 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: @@ -751,8 +725,6 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { @@ -863,8 +835,6 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16: @@ -882,8 +852,6 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: @@ -899,8 +867,6 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { @@ -997,8 +963,6 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: @@ -1014,8 +978,6 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, 
v1 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -1109,8 +1071,6 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: @@ -1126,8 +1086,6 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -1223,8 +1181,6 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: @@ -1240,8 +1196,6 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -1337,8 +1291,6 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: @@ -1354,8 +1306,6 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-NEXT: s_nop 0 -; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -1487,8 +1437,6 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: @@ -1508,8 +1456,6 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: 
v_sub_f16_e32 v1, v1, v3 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: @@ -1527,8 +1473,6 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -1662,8 +1606,6 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: @@ -1683,8 +1625,6 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: @@ -1702,8 +1642,6 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -1837,8 +1775,6 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: @@ -1858,8 +1794,6 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: @@ -1877,8 +1811,6 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ 
-2013,8 +1945,6 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: @@ -2034,8 +1964,6 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: @@ -2053,8 +1981,6 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -2189,8 +2115,6 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: @@ -2210,8 +2134,6 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: @@ -2229,8 +2151,6 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -2366,8 +2286,6 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: @@ -2387,8 +2305,6 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: @@ -2406,8 +2322,6 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -2520,8 +2434,6 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: @@ -2539,8 +2451,6 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: @@ -2556,8 +2466,6 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -2667,8 +2575,6 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-FLUSH-NEXT: s_nop 0 -; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: @@ -2686,8 +2592,6 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-STRICT-NEXT: s_nop 0 -; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: @@ -2703,8 +2607,6 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 -; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index 
ba8b6fb80518..435d5bd79d09 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -60,8 +60,6 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f16_e32 v1, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %1 = call half @llvm.nearbyint.f16(half %in) store half %1, ptr addrspace(1) %out @@ -100,8 +98,6 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %0 = call float @llvm.nearbyint.f32(float %in) @@ -142,8 +138,6 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; GFX11-NEXT: v_rndne_f32_e32 v1, s3 ; GFX11-NEXT: v_rndne_f32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) @@ -192,8 +186,6 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; GFX11-NEXT: v_rndne_f32_e32 v1, s5 ; GFX11-NEXT: v_rndne_f32_e32 v0, s4 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) @@ -253,8 +245,6 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %0 = call double @llvm.nearbyint.f64(double %in) @@ -329,8 +319,6 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) @@ -437,8 +425,6 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 8267bb9f5450..b821f9968490 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -57,8 +57,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_sub_f16_e64 v1, s2, |s4| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs @@ -121,8 +119,6 @@ define amdgpu_kernel 
void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mul_f16_e64 v1, s2, -|s4| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs @@ -180,8 +176,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) @@ -236,8 +230,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in) %fsub = fsub half -0.0, %fabs @@ -280,8 +272,6 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in, align 2 %fabs = call half @llvm.fabs.f16(half %val) @@ -352,8 +342,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <2 x half> %in, %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %add) @@ -410,8 +398,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg.fabs = fsub <2 x half> , %fabs @@ -455,8 +441,6 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) %fsub = fsub <4 x half> , %fabs @@ -523,8 +507,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg.fabs = fsub <2 x half> , %fabs @@ -596,8 +578,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX11-NEXT: global_store_b32 v0, v2, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg = fsub <2 x half> , %fabs @@ -678,8 +658,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b32 v0, v2, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg = fsub <2 x half> , %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 98b17bbaa0a9..4412e04e121a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1535,8 +1535,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX11-NEXT: v_cndmask_b32_e64 v1, s2, -v0, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v0, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = select i1 %z, double %x, double %y %b = fneg double %a diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 40982347f3ca..b9dd2727b367 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -51,8 +51,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = fsub half -0.0, %in store half %fneg, ptr addrspace(1) %out @@ -112,8 +110,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid @@ -170,8 +166,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i16 %in to half %fsub = fsub half -0.0, %bc @@ -231,8 +225,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e64 v1, -v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %fsub = fsub half -0.0, %val @@ -287,8 +279,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = fsub <2 x half> , %in store 
<2 x half> %fneg, ptr addrspace(1) %out @@ -334,8 +324,6 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %in = call i32 asm sideeffect "; def $0", "=s"() %in.bc = bitcast i32 %in to <2 x half> @@ -395,8 +383,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -453,8 +439,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to <2 x half> %fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc @@ -525,8 +509,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fsub = fsub <2 x half> <half -0.0, half -0.0>, %val @@ -605,8 +587,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val @@ -663,8 +643,6 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index e447429539e6..d8809132883a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -39,8 +39,6 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = fsub float -0.000000e+00, %in store float %fneg, ptr addrspace(1) %out @@ -86,8 +84,6 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>,
%in store <2 x float> %fneg, ptr addrspace(1) %out @@ -146,8 +142,6 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in store <4 x float> %fneg, ptr addrspace(1) %out @@ -186,8 +180,6 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to float %fsub = fsub float 0.0, %bc @@ -230,8 +222,6 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to float %fsub = fsub float -0.0, %bc @@ -271,8 +261,6 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fsub = fsub float -0.0, %in %fmul = fmul float %fsub, %in @@ -313,8 +301,6 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %in.bc = bitcast float %in to i32 %int.abs = xor i32 %in.bc, 2147483648 @@ -359,8 +345,6 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = xor i32 %in, -2147483648 store i32 %fneg, ptr addrspace(1) %out @@ -409,8 +393,6 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = xor i32 %in, -2147483648 %bitcast = bitcast i32 %fneg to float @@ -467,8 +449,6 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = xor i64 %in, -9223372036854775808 store i64 %fneg, ptr addrspace(1) %out @@ -515,8 +495,6 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ;
GFX11-NEXT: s_endpgm %fneg = xor i64 %in, -9223372036854775808 %bitcast = bitcast i64 %fneg to double @@ -582,8 +560,6 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = xor i16 %in, -32768 %bitcast = bitcast i16 %fneg to half @@ -657,8 +633,6 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %in = bitcast i32 %arg to <2 x i16> %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768> @@ -741,8 +715,6 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %in = bitcast i32 %arg to <2 x i16> %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768> diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index e94e2ee9f37d..a0578756433f 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -11,15 +11,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val, ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret_offset: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) ret void @@ -29,15 +25,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -96,15 +88,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2f16_noret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL:
struct_buffer_atomic_add_v2f16_noret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -141,15 +129,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val, ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -159,15 +143,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x bfloat> %val, <4 x i32> ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2bf16: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index fb731cc00d3f..038e7b4f5e2b 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -44,8 +44,6 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 @@ -92,8 +90,6 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 @@ -129,8 +125,6 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 @@ -177,8 +171,6 @@ define 
amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -227,8 +219,6 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x.fabs = tail call float @llvm.fabs.f32(float %x) #3 %cmpinf = fcmp one float %x.fabs, 0x7FF0000000000000 @@ -274,8 +264,6 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -331,8 +319,6 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %ninf = fcmp une float %x, 0x7FF0000000000000 @@ -385,8 +371,6 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 @@ -442,8 +426,6 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp uno float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -492,8 +474,6 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -542,8 +522,6 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -603,8 +581,6 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, 
s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, %y %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -654,8 +630,6 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = tail call half @llvm.fabs.f16(half %x) #1 %cmp = fcmp oeq half %fabs, 0xH7C00 @@ -706,8 +680,6 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 @@ -760,8 +732,6 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 742aeb96fcc2..626a22653f7d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -68,8 +68,6 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: @@ -80,8 +78,6 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: @@ -134,8 +130,6 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -180,8 +174,6 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v[0:1], v0, off -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32: @@ -189,8 +181,6 @@ define amdgpu_ps void 
@raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32: @@ -230,8 +220,6 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -453,8 +441,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: @@ -465,8 +451,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: @@ -519,8 +503,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -565,8 +547,6 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v[0:1], v0, off -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32: @@ -574,8 +554,6 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32: @@ -615,8 +593,6 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -688,8 +664,6 @@ define amdgpu_kernel void 
@raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: @@ -702,8 +676,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[6:7] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: @@ -767,8 +739,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll index 950d228f2992..ba3735c11861 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -66,8 +66,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: @@ -120,8 +118,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -166,8 +162,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f32(ptr addrspace(8) inreg ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v[0:1], v0, off -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32: @@ -207,8 +201,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f32(ptr addrspace(8) inreg ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -417,8 +409,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, 
s[0:3], 0 offen -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: @@ -471,8 +461,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -517,8 +505,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f32(ptr addrspace(8) inreg ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v[0:1], v0, off -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32: @@ -558,8 +544,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f32(ptr addrspace(8) inreg ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -631,8 +615,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] -; GFX1100-NEXT: s_nop 0 -; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: @@ -696,8 +678,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] -; G_GFX1100-NEXT: s_nop 0 -; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll index abe7a5cc43f4..a55c3d8c13df 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll @@ -9,8 +9,6 @@ define amdgpu_cs void @global_atomic_fmin_num_f32_noret(ptr addrspace(1) %ptr, f ; GFX12-LABEL: global_atomic_fmin_num_f32_noret: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) ret void @@ -20,8 +18,6 @@ define amdgpu_cs void @global_atomic_fmax_num_f32_noret(ptr addrspace(1) %ptr, f ; GFX12-LABEL: global_atomic_fmax_num_f32_noret: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ret = call float 
@llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) ret void @@ -33,8 +29,6 @@ define amdgpu_cs void @global_atomic_fmax_num_f32_rtn(ptr addrspace(1) %ptr, flo ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[3:4], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) store float %ret, ptr addrspace(1) %out @@ -47,8 +41,6 @@ define amdgpu_cs void @global_atomic_fmin_num_f32_rtn(ptr addrspace(1) %ptr, flo ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[3:4], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) store float %ret, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 064e88873a17..a169737493bc 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -61,8 +61,6 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp32: @@ -81,8 +79,6 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ; ; CYPRESS-LABEL: test_convert_fp16_to_fp32: diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index 6c9f451167b7..865d64605f65 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -64,8 +64,6 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp64: @@ -86,8 +84,6 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in, align 2 %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 5bac71007047..47b195a8f170 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -60,8 +60,6 
@@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: test_convert_fp32_to_fp16: @@ -80,8 +78,6 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ; ; CYPRESS-LABEL: test_convert_fp32_to_fp16: diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index a40d678e84d7..d2e6b9266fa5 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -76,8 +76,6 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fpext_f16_to_f32: @@ -96,8 +94,6 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { @@ -184,8 +180,6 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fpext_f16_to_f64: @@ -206,8 +200,6 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { @@ -296,8 +288,6 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32: @@ -319,8 +309,6 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { @@ -418,8 +406,6 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX11-TRUE16-NEXT: buffer_store_b128 
v[0:3], off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64: @@ -444,8 +430,6 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -491,8 +475,6 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32: @@ -505,8 +487,6 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s4 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm entry: %a.trunc = trunc i32 %a to i16 @@ -587,8 +567,6 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32: @@ -607,8 +585,6 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -691,8 +667,6 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32: @@ -711,8 +685,6 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -795,8 +767,6 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32: @@ -815,8 +785,6 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; 
GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -920,8 +888,6 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32: @@ -944,8 +910,6 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -1049,8 +1013,6 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: @@ -1073,8 +1035,6 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -1177,8 +1137,6 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32: @@ -1201,8 +1159,6 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -1306,8 +1262,6 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: @@ -1330,8 +1284,6 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -1434,8 +1386,6 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; 
GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: @@ -1458,8 +1408,6 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -1564,8 +1512,6 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: @@ -1588,8 +1534,6 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 327f2653c474..9e92a89501cf 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -59,8 +59,6 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fptosi_f16_to_i16: @@ -79,8 +77,6 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -148,8 +144,6 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fptosi_f16_to_i32: @@ -170,8 +164,6 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -245,8 +237,6 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; 
GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fptosi_f16_to_i64: @@ -268,8 +258,6 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -351,8 +339,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16: @@ -377,8 +363,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -455,8 +439,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32: @@ -481,8 +463,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -569,8 +549,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64: @@ -598,8 +576,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -649,8 +625,6 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fptosi_f16_to_i1: @@ -665,8 +639,6 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; 
GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %conv = fptosi half %in to i1
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index ba540f4948b5..804208998f9e 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -59,8 +59,6 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_f16_to_i16:
@@ -79,8 +77,6 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_u16_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -148,8 +144,6 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_f16_to_i32:
@@ -170,8 +164,6 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -245,8 +237,6 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_f16_to_i64:
@@ -268,8 +258,6 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -350,8 +338,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16:
@@ -376,8 +362,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -454,8 +438,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32:
@@ -480,8 +462,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -567,8 +547,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64:
@@ -595,8 +573,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -647,8 +623,6 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, 1.0, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_f16_to_i1:
@@ -663,8 +637,6 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %conv = fptoui half %in to i1
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 0817ac1b3cd6..0ea412a6b6f1 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -115,8 +115,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16:
@@ -129,8 +127,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -256,8 +252,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
@@ -272,8 +266,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -409,8 +401,6 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
@@ -426,8 +416,6 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -578,8 +566,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-SDAG-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
@@ -598,8 +584,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -717,8 +701,6 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
@@ -731,8 +713,6 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -s2
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -851,8 +831,6 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
@@ -865,8 +843,6 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s2|
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -985,8 +961,6 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -|v0|
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
@@ -999,8 +973,6 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -|s2|
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -1122,8 +1094,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
@@ -1138,8 +1108,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -1261,8 +1229,6 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
@@ -1277,8 +1243,6 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -1407,8 +1371,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
@@ -1423,8 +1385,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 1ba5e8f916cb..7a18e2ef7b4a 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
 ; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f64_to_f32:
@@ -88,8 +86,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc double %in to float
   store float %result, ptr addrspace(1) %out
@@ -483,8 +479,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX11-SAFE-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SAFE-SDAG-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-SAFE-SDAG-NEXT:    s_nop 0
-; GFX11-SAFE-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SAFE-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
@@ -542,8 +536,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-SAFE-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SAFE-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-SAFE-GISEL-NEXT:    s_nop 0
-; GFX11-SAFE-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SAFE-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
@@ -556,8 +548,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-UNSAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-UNSAFE-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-UNSAFE-SDAG-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-UNSAFE-SDAG-NEXT:    s_nop 0
-; GFX11-UNSAFE-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-UNSAFE-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
@@ -570,8 +560,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-UNSAFE-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-UNSAFE-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-UNSAFE-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-UNSAFE-GISEL-NEXT:    s_nop 0
-; GFX11-UNSAFE-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-UNSAFE-GISEL-NEXT:    s_endpgm
   %result = fptrunc double %in to half
   %result_i16 = bitcast half %result to i16
@@ -653,8 +641,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
 ; GFX11-SDAG-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX11-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX11-SDAG-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
@@ -668,8 +654,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX11-GISEL-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc <2 x double> %in to <2 x float>
   store <2 x float> %result, ptr addrspace(1) %out
@@ -763,8 +747,6 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
@@ -779,8 +761,6 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v2, s[8:9]
 ; GFX11-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc <3 x double> %in to <3 x float>
   store <3 x float> %result, ptr addrspace(1) %out
@@ -873,8 +853,6 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
 ; GFX11-SDAG-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX11-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX11-SDAG-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
@@ -890,8 +868,6 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v2, s[8:9]
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v3, s[10:11]
 ; GFX11-GISEL-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc <4 x double> %in to <4 x float>
   store <4 x float> %result, ptr addrspace(1) %out
@@ -1015,8 +991,6 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
 ; GFX11-SDAG-NEXT:    s_clause 0x1
 ; GFX11-SDAG-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
 ; GFX11-SDAG-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
@@ -1038,8 +1012,6 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
 ; GFX11-GISEL-NEXT:    s_clause 0x1
 ; GFX11-GISEL-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
 ; GFX11-GISEL-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc <8 x double> %in to <8 x float>
   store <8 x float> %result, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index b3432c457d9a..a92015269f8c 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -214,8 +214,6 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f16:
@@ -253,8 +251,6 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -399,8 +395,6 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f16:
@@ -422,8 +416,6 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -568,8 +560,6 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f16:
@@ -591,8 +581,6 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #1 {
   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -799,8 +787,6 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f32:
@@ -837,8 +823,6 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -975,8 +959,6 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f32:
@@ -998,8 +980,6 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -1136,8 +1116,6 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f32:
@@ -1159,8 +1137,6 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #1 {
   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -1380,8 +1356,6 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v12, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f64:
@@ -1415,8 +1389,6 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v12, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -1609,8 +1581,6 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f64:
@@ -1640,8 +1610,6 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -1834,8 +1802,6 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f64:
@@ -1865,8 +1831,6 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #1 {
   %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -2203,8 +2167,6 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_fma_f16 v1, -v1, v6, v4
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f16:
@@ -2268,8 +2230,6 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v3
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
@@ -2826,8 +2786,6 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v4f16:
@@ -2938,8 +2896,6 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v5, v3
 ; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
@@ -3242,8 +3198,6 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f32 v0, -v3, v2, v0
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f32:
@@ -3302,8 +3256,6 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1150-NEXT:    v_fmac_f32_e32 v0, v3, v2
 ; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
@@ -3798,8 +3750,6 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX11-NEXT:    v_fma_f32 v0, -v5, v4, v0
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v4f32:
@@ -3902,8 +3852,6 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
 ; GFX1150-NEXT:    v_fmac_f32_e32 v0, v5, v4
 ; GFX1150-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
@@ -4233,8 +4181,6 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
 ; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f64:
@@ -4286,8 +4232,6 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
 ; GFX1150-NEXT:    global_store_b128 v16, v[0:3], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
     ptr addrspace(1) %in2) #0 {
   %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 4f230140f7ba..8ad20d3da9a9 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -99,8 +99,6 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, s2, v0, s3
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -171,8 +169,6 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 25
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
@@ -303,8 +299,6 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, v0, s1
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, v3, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -390,8 +384,6 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, 23
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, 25
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
@@ -582,8 +574,6 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, v5, s9
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, v6, s8
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
@@ -689,8 +679,6 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, 25
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, 31
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
@@ -782,8 +770,6 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %shl = shl i32 %a, 7
   %xor = xor i32 %shl, %b
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 31f574d44ab8..551af1aa5cf7 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -91,8 +91,6 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s5, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@@ -163,8 +161,6 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 7
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
@@ -265,8 +261,6 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, v0
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, v2
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -352,8 +346,6 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, 9
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, 7
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
@@ -484,8 +476,6 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, v4
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, v5
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
@@ -589,8 +579,6 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, 7
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, 1
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index 2a79793443fb..5495f0a8b0b7 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @fsub_f16(
 ; GFX11-NEXT:    s_mov_b32 s9, s5
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -163,8 +161,6 @@ define amdgpu_kernel void @fsub_f16_imm_a(
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    v_sub_f16_e32 v0, 1.0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) {
@@ -248,8 +244,6 @@ define amdgpu_kernel void @fsub_f16_imm_b(
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    v_add_f16_e32 v0, -2.0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -362,8 +356,6 @@ define amdgpu_kernel void @fsub_v2f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -459,8 +451,6 @@ define amdgpu_kernel void @fsub_v2f16_imm_a(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) {
@@ -554,8 +544,6 @@ define amdgpu_kernel void @fsub_v2f16_imm_b(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v0, 0xbc00c000, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 08da89ec0fb2..020c9dc130bb 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -9,8 +9,6 @@ define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %ld = load i8, ptr addrspace(4) %in
   %sext = sext i8 %ld to i32
@@ -29,8 +27,6 @@ define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrsp
 ; DAG-NEXT:    s_wait_kmcnt 0x0
 ; DAG-NEXT:    v_mov_b32_e32 v2, s0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_i8_imm:
@@ -41,8 +37,6 @@ define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrsp
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 -100
   %ld = load i8, ptr addrspace(4) %gep
@@ -58,8 +52,6 @@ define amdgpu_ps void @test_s_load_i8_sgpr(ptr addrspace(4) inreg %in, i32 inreg
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %zext = zext i32 %offset to i64
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext
@@ -76,8 +68,6 @@ define amdgpu_ps void @test_s_load_i8_sgpr_imm(ptr addrspace(4) inreg %in, i32 i
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
   %zext = zext i32 %offset to i64
@@ -94,8 +84,6 @@ define amdgpu_ps void @test_s_load_i8_divergent(ptr addrspace(4) inreg %in, i32
 ; GCN-NEXT:    global_load_i8 v0, v0, s[0:1] offset:16
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_store_b32 v[1:2], v0, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
   %zext = zext i32 %offset to i64
@@ -113,8 +101,6 @@ define amdgpu_ps void @test_s_load_u8(ptr addrspace(4) inreg %in, ptr addrspace(
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %ld = load i8, ptr addrspace(4) %in
   %zext = zext i8 %ld to i32
@@ -129,8 +115,6 @@ define amdgpu_ps void @test_s_load_u8_imm(ptr addrspace(4) inreg %in, ptr addrsp
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 255
   %ld = load i8, ptr addrspace(4) %gep
@@ -146,8 +130,6 @@ define amdgpu_ps void @test_s_load_u8_sgpr(ptr addrspace(4) inreg %in, i32 inreg
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %zext1 = zext i32 %offset to i64
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext1
@@ -164,8 +146,6 @@ define amdgpu_ps void @test_s_load_u8_sgpr_imm(ptr addrspace(4) inreg %in, i32 i
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
   %zext1= zext i32 %offset to i64
@@ -182,8 +162,6 @@ define amdgpu_ps void @test_s_load_u8_divergent(ptr addrspace(4) inreg %in, i32
 ; GCN-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_store_b32 v[1:2], v0, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
   %zext1= zext i32 %offset to i64
@@ -201,8 +179,6 @@ define amdgpu_ps void @test_s_load_i16(ptr addrspace(4) inreg %in, ptr addrspace
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %ld = load i16, ptr addrspace(4) %in
   %sext = sext i16 %ld to i32
@@ -221,8 +197,6 @@ define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrs
 ; DAG-NEXT:    s_wait_kmcnt 0x0
 ; DAG-NEXT:    v_mov_b32_e32 v2, s0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_i16_imm:
@@ -233,8 +207,6 @@ define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrs
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep = getelementptr i16, ptr addrspace(4) %in, i64 -100
   %ld = load i16, ptr addrspace(4) %gep
@@ -250,8 +222,6 @@ define amdgpu_ps void @test_s_load_i16_sgpr(ptr addrspace(4) inreg %in, i32 inre
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %zext = zext i32 %offset to i64
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext
@@ -272,8 +242,6 @@ define amdgpu_ps void @test_s_load_i16_sgpr_imm(ptr addrspace(4) inreg %in, i32
 ; DAG-NEXT:    s_wait_kmcnt 0x0
 ; DAG-NEXT:    v_mov_b32_e32 v2, s0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_i16_sgpr_imm:
@@ -287,8 +255,6 @@ define amdgpu_ps void @test_s_load_i16_sgpr_imm(ptr addrspace(4) inreg %in, i32
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
   %zext = zext i32 %offset to i64
@@ -311,8 +277,6 @@ define amdgpu_ps void @test_s_load_i16_divergent(ptr addrspace(4) inreg %in, i32
 ; DAG-NEXT:    global_load_i16 v0, v[3:4], off offset:32
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[1:2], v0, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_i16_divergent:
@@ -328,8 +292,6 @@ define amdgpu_ps void @test_s_load_i16_divergent(ptr addrspace(4) inreg %in, i32
 ; GISEL-NEXT:    global_load_i16 v0, v[0:1], off offset:32
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    global_store_b32 v[3:4], v0, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
   %zext = zext i32 %offset to i64
@@ -347,8 +309,6 @@ define amdgpu_ps void @test_s_load_u16(ptr addrspace(4) inreg %in, ptr addrspace
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %ld = load i16, ptr addrspace(4) %in
   %zext = zext i16 %ld to i32
@@ -363,8 +323,6 @@ define amdgpu_ps void @test_s_load_u16_imm(ptr addrspace(4) inreg %in, ptr addrs
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep = getelementptr i16, ptr addrspace(4) %in, i64 255
   %ld = load i16, ptr addrspace(4) %gep
@@ -380,8 +338,6 @@ define amdgpu_ps void @test_s_load_u16_sgpr(ptr addrspace(4) inreg %in, i32 inre
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %zext1 = zext i32 %offset to i64
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext1
@@ -402,8 +358,6 @@ define amdgpu_ps void @test_s_load_u16_sgpr_imm(ptr addrspace(4) inreg %in, i32
 ; DAG-NEXT:    s_wait_kmcnt 0x0
 ; DAG-NEXT:    v_mov_b32_e32 v2, s0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_u16_sgpr_imm:
@@ -417,8 +371,6 @@ define amdgpu_ps void @test_s_load_u16_sgpr_imm(ptr addrspace(4) inreg %in, i32
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
   %zext1= zext i32 %offset to i64
@@ -441,8 +393,6 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
 ; DAG-NEXT:    global_load_u16 v0, v[3:4], off offset:32
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[1:2], v0, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_u16_divergent:
@@ -458,8 +408,6 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
 ; GISEL-NEXT:    global_load_u16 v0, v[0:1], off offset:32
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    global_store_b32 v[3:4], v0, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
   %zext1= zext i32 %offset to i64
@@ -477,8 +425,6 @@ define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr a
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 4, i32 0)
@@ -494,8 +440,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspa
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
@@ -511,8 +455,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %sr
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %off = add nuw nsw i32 %in, 100
@@ -528,8 +470,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32>
 ; DAG-NEXT:    buffer_load_i8 v2, v2, s[0:3], null offen
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
@@ -537,8 +477,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32>
 ; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
@@ -556,8 +494,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 4, i32 0)
@@ -575,8 +511,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrsp
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
@@ -594,8 +528,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %s
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %off = add nuw nsw i32 %in, 100
@@ -611,8 +543,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32
 ; DAG-NEXT:    buffer_load_u8 v2, v2, s[0:3], null offen
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
@@ -621,8 +551,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
@@ -638,8 +566,6 @@ define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 4, i32 0)
@@ -655,8 +581,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrsp
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
@@ -672,8 +596,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %s
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %off = add nuw nsw i32 %in, 100
@@ -689,8 +611,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32
 ; DAG-NEXT:    buffer_load_i16 v2, v2, s[0:3], null offen
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
@@ -698,8 +618,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32
 ; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
@@ -717,8 +635,6 @@ define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 4, i32 0)
@@ -736,8 +652,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrs
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
@@ -755,8 +669,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
main_body:
   %off = add nuw nsw i32 %in, 100
@@ -772,8 +684,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i3
 ; DAG-NEXT:    buffer_load_u16 v2, v2, s[0:3], null offen
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
@@ -782,8 +692,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i3
 ; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 059f925ee99a..dcd366e77944 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -23054,8 +23054,6 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:  .LBB92_2:
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: infer_as_before_atomic:
@@ -23096,8 +23094,6 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX11-NEXT:  .LBB92_2:
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: infer_as_before_atomic:
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index a3c8bb141fd9..fb72dcacee4c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -2938,8 +2938,6 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-LABEL: global_inc_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_inc_u32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2956,8 +2954,6 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_inc_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_inc_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3014,8 +3010,6 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-LABEL: global_inc_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_inc_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3032,8 +3026,6 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_inc_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3095,8 +3087,6 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-LABEL: global_dec_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3113,8 +3103,6 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_dec_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3171,8 +3159,6 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-LABEL: global_dec_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_dec_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3189,8 +3175,6 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_dec_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
index 790056b320d8..0b061da575a2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
@@ -19,8 +19,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr:
@@ -28,8 +26,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
@@ -52,8 +48,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3] offset:2047
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047:
@@ -61,8 +55,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3] offset:2047
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
@@ -86,8 +78,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3] offset:-2048
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048:
@@ -95,8 +85,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3] offset:-2048
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
@@ -143,8 +131,6 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_uniform_ptr_in_vgprs:
@@ -155,8 +141,6 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
@@ -196,8 +180,6 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1] offset:-120
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset:
@@ -208,8 +190,6 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1] offset:-120
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
@@ -232,15 +212,11 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_i16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -257,15 +233,11 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -283,15 +255,11 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_f16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_f16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -308,15 +276,11 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -334,15 +298,11 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_i32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -359,15 +319,11 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -385,15 +341,11 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_f32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32
v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_f32_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -410,15 +362,11 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -436,15 +384,11 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(ptr addrspace(1) inreg %s ; GFX11-LABEL: global_store_saddr_p3_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_p3_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -461,15 +405,11 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr_offset_neg128(ptr addrspa ; GFX11-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -487,15 +427,11 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr(ptr addrspace(1) inreg % ; GFX11-LABEL: global_store_saddr_i64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_i64_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -512,15 +448,11 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: 
; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -538,15 +470,11 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(ptr addrspace(1) inreg % ; GFX11-LABEL: global_store_saddr_f64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_f64_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -563,15 +491,11 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(ptr addrsp ; GFX11-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -589,15 +513,11 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v2i32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2i32_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -614,15 +534,11 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -640,15 +556,11 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v2f32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2f32_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -665,15 +577,11 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -691,15 +599,11 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v4i16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4i16_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -716,15 +620,11 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -742,15 +642,11 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v4f16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4f16_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -767,15 +663,11 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -793,15 
+685,11 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(ptr addrspace(1) inreg %s ; GFX11-LABEL: global_store_saddr_p1_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_p1_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -818,15 +706,11 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(ptr addrspa ; GFX11-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -844,15 +728,11 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v3i32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v3i32_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -869,15 +749,11 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -895,15 +771,11 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v3f32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v3f32_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -920,15 +792,11 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -946,15 +814,11 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v6i16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v6i16_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -971,15 +835,11 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -997,15 +857,11 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v6f16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v6f16_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1022,15 +878,11 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1048,15 +900,11 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v4i32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4i32_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1073,15 +921,11 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1099,15 +943,11 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v4f32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4f32_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1124,15 +964,11 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1150,15 +986,11 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v2i64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2i64_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1175,15 +1007,11 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1201,15 +1029,11 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v2f64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2f64_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1226,15 +1050,11 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1252,15 +1072,11 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v8i16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v8i16_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1277,15 +1093,11 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1303,15 +1115,11 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v8f16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v8f16_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm 
%zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1328,15 +1136,11 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr addr ; GFX11-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1354,15 +1158,11 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v2p1_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2p1_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1379,15 +1179,11 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr addrs ; GFX11-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1405,15 +1201,11 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(ptr addrspace(1) inreg ; GFX11-LABEL: global_store_saddr_v4p3_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4p3_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1430,15 +1222,11 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr addrs ; GFX11-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 
%zext.offset @@ -1460,16 +1248,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(ptr addrspace(1) ; GFX11-LABEL: atomic_global_store_saddr_i32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1486,16 +1270,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(ptr ; GFX11-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1513,16 +1293,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(ptr addrspace(1) ; GFX11-LABEL: atomic_global_store_saddr_i64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1539,16 +1315,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(ptr ; GFX11-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1570,15 +1342,11 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(ptr addrspace(1) i ; GFX11-LABEL: global_store_saddr_i16_d16hi_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_i16_d16hi_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm 
%zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1596,15 +1364,11 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr ; GFX11-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1623,15 +1387,11 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr addrsp ; GFX11-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1650,15 +1410,11 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg ; GFX11-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] offset:-128 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index ba2d48166b2e..24c08ec86051 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -124,8 +124,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -283,8 +281,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -417,8 +413,6 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst @@ -570,8 +564,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -700,8 +692,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -859,8 +849,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -993,8 +981,6 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst @@ -1146,8 +1132,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -1276,8 +1260,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -1435,8 +1417,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -1569,8 +1549,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst @@ -1722,8 +1700,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, 
v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -1843,8 +1819,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -1993,8 +1967,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -2118,8 +2090,6 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst @@ -2262,8 +2232,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -2383,8 +2351,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -2533,8 +2499,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -2658,8 +2622,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst @@ -2802,8 +2764,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -2923,8 +2883,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: 
global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -3073,8 +3031,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -3198,8 +3154,6 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst @@ -3342,8 +3296,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -3463,8 +3415,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -3613,8 +3563,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -3738,8 +3686,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst @@ -3882,8 +3828,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -4012,8 +3956,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -4171,8 +4113,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv 
scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -4305,8 +4245,6 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst @@ -4458,8 +4396,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -4698,8 +4634,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -4857,8 +4791,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -4991,8 +4923,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst @@ -5144,8 +5074,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -5274,8 +5202,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -5433,8 +5359,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -5567,8 +5491,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst @@ -5720,8 +5642,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -5941,8 +5861,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -6116,8 +6034,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -6265,8 +6181,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -6434,8 +6348,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -6500,8 +6412,6 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %in, i64 4 @@ -6567,8 +6477,6 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %in, i64 -4 @@ -6630,8 +6538,6 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %val = load atomic i64, ptr addrspace(1) %in syncscope("agent") seq_cst, align 8 @@ -6711,8 +6617,6 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr 
addrspace(1) %in, p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index @@ -6792,8 +6696,6 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index @@ -6874,8 +6776,6 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr addrspace(1) %in, i64 %index @@ -6930,8 +6830,6 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -6982,8 +6880,6 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: store atomic i64 %in, ptr addrspace(1) %out seq_cst, align 8 @@ -7051,8 +6947,6 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -7120,8 +7014,6 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -7190,8 +7082,6 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr addrspace(1) %out, i64 %index @@ -7320,8 +7210,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 @@ -7525,8 +7413,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index c3a197ce9985..fa0ab4537fcd 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -166,8 +166,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1164-NEXT: .LBB0_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: @@ -187,8 +185,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1132-NEXT: .LBB0_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: @@ -341,8 +337,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1164-DPP-NEXT: .LBB0_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: @@ -362,8 +356,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1132-DPP-NEXT: .LBB0_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.ignore.denormal.mode !1 ret void @@ -14014,8 +14006,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1164-NEXT: .LBB18_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -14035,8 +14025,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1132-NEXT: .LBB18_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -14189,8 +14177,6 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1164-DPP-NEXT: .LBB18_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -14210,8 +14196,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1132-DPP-NEXT: .LBB18_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1 ret void @@ -14368,8 +14352,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1164-NEXT: .LBB19_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -14389,8 +14371,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1132-NEXT: .LBB19_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -14543,8 +14523,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1164-DPP-NEXT: .LBB19_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -14564,8 +14542,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] ; GFX1132-DPP-NEXT: .LBB19_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 69c6adf0300c..9b9dd744945b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -124,8 +124,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB0_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: 
global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: @@ -141,8 +139,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB0_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: @@ -253,8 +249,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB0_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: @@ -270,8 +264,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB0_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1 ret void @@ -1158,8 +1150,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: @@ -1175,8 +1165,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB2_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: @@ -1287,8 +1275,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB2_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: @@ -1304,8 +1290,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB2_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1 ret void @@ -2194,8 +2178,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: @@ -2211,8 +2193,6 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: @@ -2323,8 +2303,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB4_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: @@ -2340,8 +2318,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB4_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1 ret void @@ -8957,8 +8933,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB12_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -8974,8 +8948,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB12_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9086,8 +9058,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB12_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9103,8 +9073,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB12_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1 ret void @@ -9219,8 +9187,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB13_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9236,8 +9202,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB13_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9348,8 +9312,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB13_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9365,8 +9327,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB13_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index b7890f30f776..21a65851db1d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -124,8 +124,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB0_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: @@ -141,8 +139,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB0_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: @@ -253,8 +249,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB0_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: @@ -270,8 +264,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB0_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") 
monotonic, align 4, !amdgpu.no.fine.grained.memory !1 ret void @@ -1158,8 +1150,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: @@ -1175,8 +1165,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB2_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: @@ -1287,8 +1275,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB2_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: @@ -1304,8 +1290,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB2_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1 ret void @@ -2194,8 +2178,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: @@ -2211,8 +2193,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: @@ -2323,8 +2303,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB4_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: @@ -2340,8 +2318,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB4_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1 ret void @@ -8957,8 +8933,6 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB12_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -8974,8 +8948,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB12_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9086,8 +9058,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB12_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9103,8 +9073,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-DPP-NEXT: .LBB12_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1 ret void @@ -9219,8 +9187,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-NEXT: .LBB13_2: -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9236,8 +9202,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1132-NEXT: .LBB13_2: -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9348,8 +9312,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] ; GFX1164-DPP-NEXT: .LBB13_2: -; GFX1164-DPP-NEXT: s_nop 0 -; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -9365,8 +9327,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] 
; GFX1132-DPP-NEXT: .LBB13_2: -; GFX1132-DPP-NEXT: s_nop 0 -; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 3735c6349fbb..1109a0a25349 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -36,8 +36,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store half %arg, ptr addrspace(1) %out ret void @@ -74,8 +72,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store <2 x half> %arg, ptr addrspace(1) %out ret void @@ -107,8 +103,6 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:4 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store <3 x half> %arg, ptr addrspace(1) %out ret void @@ -135,8 +129,6 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store <4 x half> %arg, ptr addrspace(1) %out ret void @@ -181,8 +173,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store <8 x half> %arg, ptr addrspace(1) %out ret void @@ -226,8 +216,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fpext = fpext <2 x half> %in to <2 x float> store <2 x float> %fpext, ptr addrspace(1) %out @@ -266,8 +254,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext half %arg to float store float %ext, ptr addrspace(1) %out @@ -312,8 +298,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <2 x half> %arg to <2 x float> store <2 x float> %ext, ptr addrspace(1) %out @@ -357,8 +341,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x float> store <3 x float> %ext, ptr addrspace(1) %out @@ -408,8 +390,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x float> store <4 x float> %ext, ptr addrspace(1) %out @@ -495,8 +475,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, ptr addrspace(1) %out @@ -541,8 +519,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext half %arg to double store double %ext, ptr addrspace(1) %out @@ -596,8 +572,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <2 x half> %arg to <2 x double> store <2 x double> %ext, ptr addrspace(1) %out @@ -664,8 +638,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x double> store <3 x double> %ext, ptr addrspace(1) %out @@ -743,8 +715,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x double> store <4 x double> %ext, ptr addrspace(1) %out @@ -878,8 +848,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x double> store <8 x double> %ext, ptr addrspace(1) %out 
@@ -908,8 +876,6 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in store half %val, ptr addrspace(1) %out @@ -938,8 +904,6 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in store <2 x half> %val, ptr addrspace(1) %out @@ -968,8 +932,6 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in store <4 x half> %val, ptr addrspace(1) %out @@ -998,8 +960,6 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in store <8 x half> %val, ptr addrspace(1) %out @@ -1030,8 +990,6 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %cvt = fpext half %val to float @@ -1083,8 +1041,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %cvt = fpext <2 x half> %val to <2 x float> @@ -1139,8 +1095,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) %in %cvt = fpext <3 x half> %val to <3 x float> @@ -1200,8 +1154,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in %cvt = fpext <4 x half> %val to <4 x float> @@ -1288,8 +1240,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v12, v[4:7], 
s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in %cvt = fpext <8 x half> %val to <8 x float> @@ -1445,8 +1395,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; GFX11-NEXT: global_store_b128 v20, v[0:3], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v20, v[12:15], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v20, v[8:11], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <16 x half>, ptr addrspace(1) %in %cvt = fpext <16 x half> %val to <16 x float> @@ -1481,8 +1429,6 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %cvt = fpext half %val to double @@ -1541,8 +1487,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %cvt = fpext <2 x half> %val to <2 x double> @@ -1619,8 +1563,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) %in %cvt = fpext <3 x half> %val to <3 x double> @@ -1706,8 +1648,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in %cvt = fpext <4 x half> %val to <4 x double> @@ -1840,8 +1780,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in %cvt = fpext <8 x half> %val to <8 x double> @@ -2090,8 +2028,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; GFX11-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v32, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <16 x half>, ptr addrspace(1) %in %cvt = fpext <16 x half> %val to <16 x double> @@ -2123,8 +2059,6 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %in %cvt = fptrunc float %val to half @@ -2178,8 +2112,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x float>, ptr addrspace(1) %in %cvt = fptrunc <2 x float> %val to <2 x half> @@ -2248,8 +2180,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 ; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <3 x float>, ptr addrspace(1) %in %cvt = fptrunc <3 x float> %val to <3 x half> @@ -2313,8 +2243,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; GFX11-NEXT: v_pack_b32_f16 v1, v2, v3 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v5 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x float>, ptr addrspace(1) %in %cvt = fptrunc <4 x float> %val to <4 x half> @@ -2413,8 +2341,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_pack_b32_f16 v0, v4, v5 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x float>, ptr addrspace(1) %in %cvt = fptrunc <8 x float> %val to <8 x half> @@ -2593,8 +2519,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <16 x float>, ptr addrspace(1) %in %cvt = fptrunc <16 x float> %val to <16 x half> @@ -2644,8 +2568,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f16_e64 v1, s4, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd half %a, %b store half %add, ptr addrspace(1) %out, align 4 @@ -2698,8 +2620,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <2 x half> %a, %b store <2 x half> %add, ptr addrspace(1) %out, align 8 @@ -2773,8 +2693,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_pk_add_f16 v1, v1, v3 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %b_ptr = getelementptr <4 x half>, ptr addrspace(1) %in, i32 1 %a = load <4 x half>, ptr addrspace(1) %in, align 16 @@ -2897,8 +2815,6 
@@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; GFX11-NEXT: v_pk_add_f16 v1, s5, s9 ; GFX11-NEXT: v_pk_add_f16 v0, s4, s8 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <8 x half> %a, %b store <8 x half> %add, ptr addrspace(1) %out, align 32 @@ -2927,8 +2843,6 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %val_int = bitcast half %val to i16 @@ -2958,8 +2872,6 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in %val_fp = bitcast i16 %val to half diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 5eb4d9b7a2be..35b6bfbee111 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -149,8 +149,6 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -310,8 +308,6 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -469,8 +465,6 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -614,8 +608,6 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -729,8 +721,6 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: global_store_b16 v4, v2, s[0:1] ; GFX11-NEXT: s_cbranch_scc0 .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -851,8 +841,6 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] ; GFX11-NEXT: s_cbranch_scc0 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -986,8 +974,6 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] ; GFX11-NEXT: s_cbranch_scc0 
.LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -1129,8 +1115,6 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] ; GFX11-NEXT: s_cbranch_scc0 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: br label %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index fee59455da4c..108d85e024ad 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -149,8 +149,6 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -360,8 +358,6 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -547,8 +543,6 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 ; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -746,8 +740,6 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -940,8 +932,6 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1211,8 +1201,6 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 ; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 ; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1382,8 +1370,6 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1564,8 +1550,6 @@ define amdgpu_kernel void 
@idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1753,8 +1737,6 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1923,8 +1905,6 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2131,8 +2111,6 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -2339,8 +2317,6 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -2546,8 +2522,6 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -2741,8 +2715,6 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -2944,8 +2916,6 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -3182,8 +3152,6 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm 
ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -3442,8 +3410,6 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 2894ae76c0be..e146cea50fa4 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -147,8 +147,6 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -345,8 +343,6 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 ; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -533,8 +529,6 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 ; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -710,8 +704,6 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3 ; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -879,8 +871,6 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 ; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1059,8 +1049,6 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 ; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1259,8 +1247,6 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2 ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1470,8 +1456,6 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_add3_u32 v0, 
s2, v2, v0 ; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1713,8 +1697,6 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3 ; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1977,8 +1959,6 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3 ; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2164,8 +2144,6 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2426,8 +2404,6 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 ; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2669,8 +2645,6 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_mad_u16 v0, v4, v7, v0 ; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2836,8 +2810,6 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3016,8 +2988,6 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3203,8 +3173,6 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3373,8 +3341,6 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3580,8 +3546,6 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -3788,8 +3752,6 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -3997,8 +3959,6 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2 ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -4191,8 +4151,6 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -4393,8 +4351,6 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -4630,8 +4586,6 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, s2 ; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -4874,8 +4828,6 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1 ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5081,8 +5033,6 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5270,8 +5220,6 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 ; GFX11-DL-NEXT: 
global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5461,8 +5409,6 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5619,8 +5565,6 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5820,8 +5764,6 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -6014,8 +5956,6 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -6194,8 +6134,6 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 ; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: s_nop 0 -; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll index 83b650e2d755..2824d5daea66 100644 --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -63,8 +63,6 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0: @@ -153,8 +151,6 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1: @@ -243,8 +239,6 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: 
load_1d_v2f16_tfe_dmask0: @@ -333,8 +327,6 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1: @@ -423,8 +415,6 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3: @@ -525,8 +515,6 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7: @@ -624,8 +612,6 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15: diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index ecece35337a7..02a9169c0e6f 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -27,8 +27,6 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_neg_0.0_i16: @@ -75,8 +73,6 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_0.0_f16: @@ -121,8 +117,6 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_imm_neg_0.0_f16: @@ -167,8 +161,6 @@ define amdgpu_kernel void 
@store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_0.5_f16: @@ -213,8 +205,6 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_m_0.5_f16: @@ -259,8 +249,6 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_1.0_f16: @@ -305,8 +293,6 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_m_1.0_f16: @@ -351,8 +337,6 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_2.0_f16: @@ -397,8 +381,6 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_m_2.0_f16: @@ -443,8 +425,6 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; 
encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_4.0_f16: @@ -489,8 +469,6 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_m_4.0_f16: @@ -535,8 +513,6 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_inv_2pi_f16: @@ -581,8 +557,6 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f16: @@ -627,8 +601,6 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: store_literal_imm_f16: @@ -677,8 +649,6 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x00,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_0.0_f16: @@ -733,8 +703,6 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: 
v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe0,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_0.5_f16: @@ -789,8 +757,6 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe2,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_0.5_f16: @@ -845,8 +811,6 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe4,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_1.0_f16: @@ -901,8 +865,6 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe6,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_1.0_f16: @@ -957,8 +919,6 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe8,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_2.0_f16: @@ -1013,8 +973,6 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xea,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_2.0_f16: @@ -1069,8 +1027,6 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr 
addrspace(1) %out, half %x ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xec,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_4.0_f16: @@ -1125,8 +1081,6 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xee,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_4.0_f16: @@ -1193,8 +1147,6 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x64] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: commute_add_inline_imm_0.5_f16: @@ -1275,8 +1227,6 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: commute_add_literal_f16: @@ -1345,8 +1295,6 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x02,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_1_f16: @@ -1401,8 +1349,6 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x04,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_2_f16: 
@@ -1457,8 +1403,6 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x20,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_16_f16: @@ -1525,8 +1469,6 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4a] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_1_f16: @@ -1606,8 +1548,6 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffe, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xfe,0xff,0x00,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_2_f16: @@ -1687,8 +1627,6 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfff0, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xf0,0xff,0x00,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_16_f16: @@ -1756,8 +1694,6 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x7e,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_63_f16: @@ -1812,8 +1748,6 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x80,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; 
GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_64_f16: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 647870f0e089..a5e0e5fdcb9a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -41,8 +41,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 @@ -109,8 +107,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -192,8 +188,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s1 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt1 = extractelement <2 x i16> %vec, i32 1 @@ -260,8 +254,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 @@ -343,8 +335,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 @@ -442,8 +432,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s1 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 @@ -496,8 +484,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 @@ -563,8 +549,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 @@ -611,8 +595,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 @@ -657,8 +639,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 @@ -728,8 +708,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -806,8 +784,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, v1, s0, 0x7060302 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -881,8 +857,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -957,8 +931,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1030,8 +1002,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, -15, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1105,8 +1075,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1178,8 +1146,6 @@ define 
amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1253,8 +1219,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1326,8 +1290,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, 35, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1415,8 +1377,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %idx = load volatile i32, ptr addrspace(4) %idx.ptr %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -1498,8 +1458,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1602,8 +1560,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, v1, 0x12341234, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1684,8 +1640,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s0, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1765,8 +1719,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1847,8 +1799,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1928,8 +1878,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2010,8 +1958,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2120,8 +2066,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1 ; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2224,8 +2168,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; GFX11-NEXT: v_bfi_b32 v1, s1, s2, v1 ; GFX11-NEXT: v_bfi_b32 v0, s0, s2, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2306,8 +2248,6 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2389,8 +2329,6 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2619,8 +2557,6 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2723,8 +2659,6 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2827,8 +2761,6 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: 
s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -3231,8 +3163,6 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index 026a8d7da708..d26f0df49b0a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -574,8 +574,6 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %cmp = icmp eq i32 %cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll index b3079f5c9787..8ae571df670a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll @@ -68,8 +68,6 @@ define amdgpu_cs void @test_s_bitreplicate_sgpr(i32 inreg %mask, ptr addrspace(1 ; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll index 113927100311..7f1c01a7a000 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -46,8 +46,6 @@ define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr ; GFX12: ; %bb.0: ; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: global_store_b32 v[3:4], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %tmp1 = bitcast i32 %tmp0 to float @@ -63,8 +61,6 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] ; GFX12-NEXT: global_store_b32 v[3:4], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %tmp1 = bitcast i32 %tmp0 to float @@ -78,8 +74,6 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a ; GFX12: ; %bb.0: ; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: global_store_b32 v[3:4], v2, off -; 
GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %tmp1 = bitcast i32 %tmp0 to float @@ -93,8 +87,6 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a ; GFX12: ; %bb.0: ; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: global_store_b32 v[3:4], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %tmp1 = bitcast i32 %tmp0 to float @@ -108,8 +100,6 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr a ; GFX12: ; %bb.0: ; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: global_store_b32 v[3:4], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %tmp1 = bitcast i32 %tmp0 to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index eed85345b3b1..7524c7cbda6c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -56,8 +56,6 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, ptr addrspace(1) %out @@ -117,8 +115,6 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, ptr addrspace(1) %out @@ -230,8 +226,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -312,8 +306,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -392,8 +384,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() 
%tid.ext = sext i32 %tid to i64 @@ -493,8 +483,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -597,8 +585,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -701,8 +687,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -806,8 +790,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll index 29535726b749..081727c3b5e1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll @@ -20,8 +20,6 @@ define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) { ; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:16 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_b32 v[1:2], v3, off -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %res = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16) store i32 %res, ptr addrspace(1) %out, align 4 @@ -43,8 +41,6 @@ define amdgpu_gs void @test_add_64_use(i32 %arg, ptr addrspace(1) %out) { ; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:32 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %res = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32) store i64 %res, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll index 0ce3f85425e0..644ecf28fd32 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll @@ -11,8 +11,6 @@ define amdgpu_gs void @test_ds_bvh_stack(i32 %addr, i32 %data0, <4 x i32> %data1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; CHECK-NEXT: global_store_b32 v[6:7], v0, off -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x 
i32> %data1, i32 0) %vdst = extractvalue { i32, i32 } %pair, 0 @@ -29,8 +27,6 @@ define amdgpu_gs void @test_ds_bvh_stack_1(i32 %addr, i32 %data0, <4 x i32> %dat ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; CHECK-NEXT: global_store_b32 v[6:7], v0, off -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1) %vdst = extractvalue { i32, i32 } %pair, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll index 96c35df3c234..63d4bac20b35 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll @@ -20,8 +20,6 @@ define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) { ; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:16 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_b32 v[1:2], v3, off -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %res = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16) store i32 %res, ptr addrspace(1) %out, align 4 @@ -43,8 +41,6 @@ define amdgpu_gs void @test_sub_64_use(i32 %arg, ptr addrspace(1) %out) { ; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:32 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %res = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32) store i64 %res, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index dad77d1efd3a..18b03efe44ef 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -21,8 +21,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: @@ -44,8 +42,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: @@ -72,8 +68,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: @@ -95,8 +89,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-GFX11-NEXT: s_nop 0 -; 
GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: @@ -130,8 +122,6 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; ; GISEL-GFX10-LABEL: v_fcmp_f32: @@ -157,8 +147,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq: @@ -184,8 +172,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq: @@ -215,8 +201,6 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; SDAG-GFX10-LABEL: v_fcmp_f32_one: @@ -242,8 +226,6 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; ; GISEL-GFX10-LABEL: v_fcmp_f32_one: @@ -273,8 +255,6 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; SDAG-GFX10-LABEL: v_fcmp_f32_ogt: @@ -300,8 +280,6 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; ; GISEL-GFX10-LABEL: v_fcmp_f32_ogt: @@ -331,8 +309,6 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oge: @@ -358,8 +334,6 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oge:
@@ -389,8 +363,6 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_olt:
@@ -416,8 +388,6 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_olt:
@@ -447,8 +417,6 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ole:
@@ -474,8 +442,6 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ole:
@@ -505,8 +471,6 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_o:
@@ -532,8 +496,6 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_o:
@@ -563,8 +525,6 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_uo:
@@ -590,8 +550,6 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_uo:
@@ -621,8 +579,6 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ueq:
@@ -648,8 +604,6 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ueq:
@@ -679,8 +633,6 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_une:
@@ -706,8 +658,6 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_une:
@@ -737,8 +687,6 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ugt:
@@ -764,8 +712,6 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ugt:
@@ -795,8 +741,6 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_uge:
@@ -822,8 +766,6 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_uge:
@@ -853,8 +795,6 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ult:
@@ -880,8 +820,6 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ult:
@@ -911,8 +849,6 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ule:
@@ -938,8 +874,6 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ule:
@@ -967,8 +901,6 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq:
@@ -990,8 +922,6 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq:
@@ -1017,8 +947,6 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_one:
@@ -1040,8 +968,6 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_one:
@@ -1067,8 +993,6 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt:
@@ -1090,8 +1014,6 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt:
@@ -1117,8 +1039,6 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oge:
@@ -1140,8 +1060,6 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oge:
@@ -1167,8 +1085,6 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_olt:
@@ -1190,8 +1106,6 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_olt:
@@ -1217,8 +1131,6 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ole:
@@ -1240,8 +1152,6 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ole:
@@ -1267,8 +1177,6 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq:
@@ -1290,8 +1198,6 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq:
@@ -1317,8 +1223,6 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_o:
@@ -1340,8 +1244,6 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_o:
@@ -1367,8 +1269,6 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uo:
@@ -1390,8 +1290,6 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uo:
@@ -1417,8 +1315,6 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_une:
@@ -1440,8 +1336,6 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_une:
@@ -1467,8 +1361,6 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt:
@@ -1490,8 +1382,6 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt:
@@ -1517,8 +1407,6 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uge:
@@ -1540,8 +1428,6 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uge:
@@ -1567,8 +1453,6 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ult:
@@ -1590,8 +1474,6 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ult:
@@ -1617,8 +1499,6 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ule:
@@ -1640,8 +1520,6 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ule:
@@ -1671,8 +1549,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
 ; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2|
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs:
@@ -1700,8 +1576,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
 ; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2|
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs:
@@ -1735,8 +1609,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
 ; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2|
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
@@ -1764,8 +1636,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
 ; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2|
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
@@ -1802,8 +1672,6 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16:
@@ -1830,8 +1698,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq:
@@ -1857,8 +1723,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq:
@@ -1889,8 +1753,6 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_one:
@@ -1916,8 +1778,6 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_one:
@@ -1948,8 +1808,6 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ogt:
@@ -1975,8 +1833,6 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ogt:
@@ -2007,8 +1863,6 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_oge:
@@ -2034,8 +1888,6 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_oge:
@@ -2066,8 +1918,6 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_olt:
@@ -2093,8 +1943,6 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_olt:
@@ -2125,8 +1973,6 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ole:
@@ -2152,8 +1998,6 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ole:
@@ -2184,8 +2028,6 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ueq:
@@ -2211,8 +2053,6 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ueq:
@@ -2243,8 +2083,6 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_une:
@@ -2270,8 +2108,6 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_une:
@@ -2302,8 +2138,6 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ugt:
@@ -2329,8 +2163,6 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ugt:
@@ -2361,8 +2193,6 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_uge:
@@ -2388,8 +2218,6 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_uge:
@@ -2420,8 +2248,6 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ult:
@@ -2447,8 +2273,6 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ult:
@@ -2478,8 +2302,6 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_o:
@@ -2505,8 +2327,6 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_o:
@@ -2536,8 +2356,6 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_uo:
@@ -2563,8 +2381,6 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_uo:
@@ -2594,8 +2410,6 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ule:
@@ -2621,8 +2435,6 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ule:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index d1883d9196af..2dddf37febf9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -24,8 +24,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -82,8 +80,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -141,8 +137,6 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
 ; GFX9-SDAG-LABEL: v_fcmp_f32:
@@ -187,8 +181,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq:
@@ -249,8 +241,6 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_one:
@@ -311,8 +301,6 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ogt:
@@ -373,8 +361,6 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oge:
@@ -435,8 +421,6 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_olt:
@@ -497,8 +481,6 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ole:
@@ -559,8 +541,6 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_o:
@@ -621,8 +601,6 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_uo:
@@ -683,8 +661,6 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ueq:
@@ -745,8 +721,6 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_une:
@@ -807,8 +781,6 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ugt:
@@ -869,8 +841,6 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_uge:
@@ -931,8 +901,6 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ult:
@@ -993,8 +961,6 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ule:
@@ -1053,8 +1019,6 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_oeq:
@@ -1113,8 +1077,6 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_one:
@@ -1173,8 +1135,6 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ogt:
@@ -1233,8 +1193,6 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_oge:
@@ -1293,8 +1251,6 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_olt:
@@ -1353,8 +1309,6 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ole:
@@ -1413,8 +1367,6 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ueq:
@@ -1473,8 +1425,6 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_o:
@@ -1533,8 +1483,6 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_uo:
@@ -1593,8 +1541,6 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_une:
@@ -1653,8 +1599,6 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ugt:
@@ -1713,8 +1657,6 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_uge:
@@ -1773,8 +1715,6 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ult:
@@ -1833,8 +1773,6 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ule:
@@ -1898,8 +1836,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_oeq_with_fabs:
@@ -1967,8 +1903,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
@@ -2032,8 +1966,6 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 ;
 ; GFX9-SDAG-LABEL: v_fcmp_f16:
@@ -2079,8 +2011,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_oeq:
@@ -2142,8 +2072,6 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_one:
@@ -2205,8 +2133,6 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ogt:
@@ -2268,8 +2194,6 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_oge:
@@ -2331,8 +2255,6 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_olt:
@@ -2394,8 +2316,6 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ole:
@@ -2457,8 +2377,6 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ueq:
@@ -2520,8 +2438,6 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_une:
@@ -2583,8 +2499,6 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ugt:
@@ -2646,8 +2560,6 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_uge:
@@ -2709,8 +2621,6 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ult:
@@ -2771,8 +2681,6 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_o:
@@ -2833,8 +2741,6 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_uo:
@@ -2895,8 +2801,6 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ule:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 453913b334a4..455323d01eb3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -17,8 +17,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -70,8 +68,6 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2
 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
     ptr addrspace(1) %r,
     <2 x bfloat> inreg %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 1343f25ec275..9cf24539828b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -16,8 +16,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 8a8b0490e948..5a2c4197eef5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -17,8 +17,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -47,8 +45,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
index f631a0bfc28e..a97f1dcc2bdd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -40,8 +40,6 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
 ; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn:
@@ -55,8 +53,6 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
 ; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 88e392981931..e19a4bcf9e0a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -16,8 +16,6 @@ define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -35,8 +33,6 @@ define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -54,8 +50,6 @@ define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -73,8 +67,6 @@ define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, pt
 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index a5841e04f6e0..5e1fe792393b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -16,8 +16,6 @@ define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr ad
 ; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -35,8 +33,6 @@ define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -54,8 +50,6 @@ define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -73,8 +67,6 @@ define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, pt
 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index d0d759a57a68..37174dec5202 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -29,8 +29,6 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_eq:
@@ -56,8 +54,6 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_eq:
@@ -91,8 +87,6 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32:
@@ -118,8 +112,6 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_ne:
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_ne:
@@ -176,8 +166,6 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_ugt:
@@ -203,8 +191,6 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_ugt:
@@ -234,8 +220,6 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_uge:
@@ -261,8 +245,6 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_uge:
@@ -292,8 +274,6 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_ult:
@@ -319,8 +299,6 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_ult:
@@ -350,8 +328,6 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_ule:
@@ -377,8 +353,6 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_ule:
@@ -408,8 +382,6 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_sgt:
@@ -435,8 +407,6 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_sgt:
@@ -466,8 +436,6 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_sge:
@@ -493,8 +461,6 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_sge:
@@ -524,8 +490,6 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_slt:
@@ -551,8 +515,6 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_slt:
@@ -582,8 +544,6 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_sle:
@@ -609,8 +569,6 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_sle:
@@ -638,8 +596,6 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_eq:
@@ -661,8 +617,6 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_eq:
@@ -688,8 +642,6 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_ne:
@@ -711,8 +663,6 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_ne:
@@ -738,8 +688,6 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ugt:
@@ -761,8 +709,6 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ugt:
@@ -788,8 +734,6 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_uge:
@@ -811,8 +755,6 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_uge:
@@ -838,8 +780,6 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ult:
@@ -861,8 +801,6 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ult:
@@ -888,8 +826,6 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ule:
@@ -911,8 +847,6 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ule:
@@ -938,8 +872,6 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sgt:
@@ -961,8 +893,6 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sgt:
@@ -988,8 +918,6 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sge:
@@ -1011,8 +939,6 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sge:
@@ -1038,8 +964,6 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_slt:
@@ -1061,8 +985,6 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_slt:
@@ -1088,8 +1010,6 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sle:
@@ -1111,8 +1031,6 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sle:
@@ -1140,8 +1058,6 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_eq:
@@ -1167,8 +1083,6 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_eq:
@@ -1202,8 +1116,6 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16:
@@ -1229,8 +1141,6 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_ne:
@@ -1256,8 +1166,6 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_ne:
@@ -1287,8 +1195,6 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_ugt:
@@ -1314,8 +1220,6 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_ugt:
@@ -1345,8 +1249,6 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_uge:
@@ -1372,8 +1274,6 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_uge:
@@ -1403,8 +1303,6 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_ult:
@@ -1430,8 +1328,6 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_ult:
@@ -1461,8 +1357,6 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_ule:
@@ -1488,8 +1382,6 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_ule:
@@ -1519,8 +1411,6 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_sgt:
@@ -1546,8 +1436,6 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_sgt:
@@ -1577,8 +1465,6 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_sge:
@@ -1604,8 +1490,6 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_sge:
@@ -1635,8 +1519,6 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_slt:
@@ -1662,8 +1544,6 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_slt:
@@ -1693,8 +1573,6 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT: s_nop 0
-; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT: s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_sle:
@@ -1720,8 +1598,6 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_sle:
@@ -1753,8 +1629,6 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ; GFX11-NEXT: s_and_b32 s2, s2, s3
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX10-LABEL: v_icmp_i1_ne0:
@@ -1790,8 +1664,6 @@ define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(ptr addrspace(1) %out, i32
 ; GISEL-GFX11-LABEL: test_intr_icmp_i32_invalid_cc:
 ; GISEL-GFX11: ; %bb.0:
 ; GISEL-GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-GFX10-LABEL: test_intr_icmp_i32_invalid_cc:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index cad3c54ae54b..f5f3bc43658e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -34,8 +32,6 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_eq:
@@ -102,8 +100,6 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i32:
@@ -140,8 +136,6 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_ne:
@@ -202,8 +196,6 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_ugt:
@@ -264,8 +256,6 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_uge:
@@ -326,8 +316,6 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_ult:
@@ -388,8 +376,6 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_ule:
@@ -450,8 +436,6 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_sgt:
@@ -512,8 +496,6 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_sge:
@@ -574,8 +556,6 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_slt:
@@ -636,8 +616,6 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_sle:
@@ -696,8 +674,6 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_eq:
@@ -756,8 +732,6 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_ne:
@@ -816,8 +790,6 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_u64_ugt:
@@ -876,8 +848,6 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_u64_uge:
@@ -936,8 +906,6 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_u64_ult:
@@ -996,8 +964,6 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_u64_ule:
@@ -1056,8 +1022,6 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_sgt:
@@ -1116,8 +1080,6 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_sge:
@@ -1176,8 +1138,6 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_slt:
@@ -1236,8 +1196,6 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_sle:
@@ -1298,8 +1256,6 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_eq:
@@ -1366,8 +1322,6 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i16:
@@ -1404,8 +1358,6 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_ne:
@@ -1466,8 +1418,6 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_ugt:
@@ -1528,8 +1478,6 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_uge:
@@ -1590,8 +1538,6 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_ult:
@@ -1652,8 +1598,6 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_ule:
@@ -1714,8 +1658,6 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_sgt:
@@ -1776,8 +1718,6 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_sge:
@@ -1838,8 +1778,6 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_slt:
@@ -1900,8 +1838,6 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_sle:
@@ -1964,8 +1900,6 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; VI-LABEL: v_icmp_i1_ne0:
@@ -2022,8 +1956,6 @@ define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(ptr addrspace(1) %out, i32
 ; GISEL-GFX11-LABEL: test_intr_icmp_i32_invalid_cc:
 ; GISEL-GFX11: ; %bb.0:
 ; GISEL-GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
-; GISEL-GFX11-NEXT: s_nop 0
-; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT: s_endpgm
 ;
 ; GISEL-VI-LABEL: test_intr_icmp_i32_invalid_cc:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
index 31ba2f224bba..21482ba1adc1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
@@ -373,23 +373,14 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX9-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_1d:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_1d:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_1d:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_1d:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -403,23 +394,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX9-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_2d:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_2d:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_2d:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_2d:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -434,23 +416,14 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX9-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_3d:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_3d:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_3d:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_3d:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -466,23 +439,14 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
 ; GFX9-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_cube:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_cube:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_cube:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_cube:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -498,23 +462,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_1darray:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_1darray:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_1darray:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_1darray:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -529,23 +484,14 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_2darray:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_2darray:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_2darray:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_2darray:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -561,23 +507,14 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_2dmsaa:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_2dmsaa:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_2dmsaa:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_2dmsaa:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -593,23 +530,14 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_2darraymsaa:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_2darraymsaa:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_2darraymsaa:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_2darraymsaa:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -626,23 +554,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_mip_1d:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_mip_1d:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_mip_1d:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_mip_1d:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -657,23 +576,14 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_mip_2d:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_mip_2d:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_mip_2d:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_mip_2d:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -689,23 +599,14 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_mip_3d:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_mip_3d:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_mip_3d:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_mip_3d:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -722,23 +623,14 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_mip_cube:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_mip_cube:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_mip_cube:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_mip_cube:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -755,23 +647,14 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_mip_1darray:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_mip_1darray:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_mip_1darray:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_mip_1darray:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -787,23 +670,14 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_mip_2darray:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_mip_2darray:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_mip_2darray:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_mip_2darray:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1060,23 +934,14 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
 ; GFX9-NEXT: image_store v0, v1, s[0:7] dmask:0x2 unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_1d_V1:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_1d_V1:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_1d_V1:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_1d_V1:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1090,23 +955,14 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
 ; GFX9-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc unorm a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_1d_V2:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_1d_V2:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_1d_V2:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_1d_V2:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1192,23 +1048,14 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_1d_glc:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_1d_glc:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_1d_glc:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_1d_glc:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_NT a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1222,23 +1069,14 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_1d_slc:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_1d_slc:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_1d_slc:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_1d_slc:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_HT a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1252,23 +1090,14 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
 ; GFX9-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc a16
 ; GFX9-NEXT: s_endpgm
 ;
-; GFX10-LABEL: store_1d_glc_slc:
-; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: store_1d_glc_slc:
-; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX10PLUS-LABEL: store_1d_glc_slc:
+; GFX10PLUS: ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
+; GFX10PLUS-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: store_1d_glc_slc:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_RT_WB a16
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1344,3 +1173,6 @@ declare void @llvm.amdgcn.image.store.1d.v2f32.i16(<2 x float>, i32, i16, <8 x i
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
 attributes #2 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll index 7c8a395b488f..dca5e671ab58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll @@ -465,15 +465,11 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX11-LABEL: store_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -495,15 +491,11 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX11-LABEL: store_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -526,15 +518,11 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX11-LABEL: store_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -558,15 +546,11 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 ; GFX11-LABEL: store_cube: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: 
[0x8c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -590,15 +574,11 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX11-LABEL: store_1darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -621,15 +601,11 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX11-LABEL: store_2darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -653,15 +629,11 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX11-LABEL: store_2dmsaa: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 ; encoding: [0x46,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -685,15 +657,11 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX11-LABEL: store_2darraymsaa: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_2darraymsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 ; encoding: [0x47,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -718,15 +686,11 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX11-LABEL: store_mip_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_mip_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -749,15 +713,11 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX11-LABEL: store_mip_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_mip_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -781,15 +741,11 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX11-LABEL: store_mip_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_mip_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -814,15 +770,11 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX11-LABEL: store_mip_cube: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_mip_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -847,15 +799,11 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX11-LABEL: store_mip_1darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_mip_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -879,15 +827,11 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX11-LABEL: store_mip_2darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_mip_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s 
= extractelement <2 x i16> %coords_lo, i32 0 @@ -1212,15 +1156,11 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16 ; GFX11-LABEL: store_1d_V1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x02,0x19,0xf0,0x01,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_1d_V1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x80,0x81,0xd0,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1242,15 +1182,11 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 ; GFX11-LABEL: store_1d_V2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0c,0x19,0xf0,0x02,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_1d_V2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x80,0x01,0xd3,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1362,15 +1298,11 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX11-LABEL: store_1d_glc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x80,0x4f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_1d_glc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_NT a16 ; encoding: [0x40,0x80,0xc1,0xd3,0x00,0x00,0x10,0x00,0x04,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1392,15 +1324,11 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX11-LABEL: store_1d_slc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x80,0x1f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_1d_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: 
image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_HT a16 ; encoding: [0x40,0x80,0xc1,0xd3,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1422,15 +1350,11 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat ; GFX11-LABEL: store_1d_glc_slc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x80,0x5f,0x19,0xf0,0x04,0x00,0x00,0x00] -; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; GFX12-LABEL: store_1d_glc_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_RT_WB a16 ; encoding: [0x40,0x80,0xc1,0xd3,0x00,0x00,0x30,0x00,0x04,0x00,0x00,0x00] -; GFX12-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX12-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index db72e6c6d0c1..2fced3240fe3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -2667,23 +2667,14 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 % ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2711,23 +2702,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 % ; NOPRT-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_2d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -2755,23 +2737,14 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 % ; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_3d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_3d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -2799,23 +2772,14 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 ; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_cube: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_cube: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_cube: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -2843,23 +2807,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; NOPRT-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_1darray: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_1darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] 
dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -2887,23 +2842,14 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_2darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_2darray: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_2darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -2931,23 +2877,14 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_2dmsaa: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_2dmsaa: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_2dmsaa: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_2dmsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -2975,23 +2912,14 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda ; NOPRT-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_2darraymsaa: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_2darraymsaa: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm 
+; GFX10PLUS-LABEL: store_2darraymsaa: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_2darraymsaa: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], [v4, v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -3019,23 +2947,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; NOPRT-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_mip_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_mip_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_mip_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3063,23 +2982,14 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; NOPRT-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_mip_2d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_mip_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_mip_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3107,23 +3017,14 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_mip_3d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_3D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_mip_3d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_mip_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3151,23 +3052,14 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_cube: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_mip_cube: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_mip_cube: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_mip_cube: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3195,23 +3087,14 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; NOPRT-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_1darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_mip_1darray: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_mip_1darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_mip_1darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3239,23 +3122,14 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_2darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], 
v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_mip_2darray: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_mip_2darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_mip_2darray: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store_mip v[0:3], [v4, v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -3693,23 +3567,14 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) ; NOPRT-NEXT: image_store v0, v1, s[0:7] dmask:0x2 unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_V1: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_1d_V1: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_1d_V1: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_1d_V1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3737,23 +3602,14 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i3 ; NOPRT-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_V2: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_1d_V2: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_1d_V2: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_1d_V2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3904,23 +3760,14 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_glc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_1d_glc: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_1d_glc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_1d_glc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_NT -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) @@ -3948,23 +3795,14 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_slc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_1d_slc: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_1d_slc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_1d_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_HT -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -3992,23 +3830,14 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_glc_slc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: store_1d_glc_slc: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: store_1d_glc_slc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: store_1d_glc_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_RT_WB -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) @@ -4202,23 +4031,13 @@ define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %a ; NOPRT-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: image_store_wait: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, 
s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: image_store_wait: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX11-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10PLUS-LABEL: image_store_wait: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: image_store_wait: ; GFX12: ; %bb.0: ; %main_body @@ -4226,8 +4045,6 @@ define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %a ; GFX12-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg3, i32 15, i32 %arg4, <8 x i32> %arg, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll index 554b961beaf7..fe76d9ca1ab5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll @@ -18,15 +18,11 @@ define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 ; GFX11-LABEL: store_f16_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_f16_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -49,15 +45,11 @@ define amdgpu_ps void @store_v2f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v2f16_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v2f16_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -80,15 +72,11 @@ define amdgpu_ps void @store_v3f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v3f16_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16 d16 -; GFX11-NEXT: 
s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v3f16_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -111,15 +99,11 @@ define amdgpu_ps void @store_v4f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v4f16_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v4f16_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -142,15 +126,11 @@ define amdgpu_ps void @store_f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 ; GFX11-LABEL: store_f16_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_f16_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -174,15 +154,11 @@ define amdgpu_ps void @store_v2f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v2f16_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v2f16_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -206,15 +182,11 @@ define amdgpu_ps void @store_v3f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v3f16_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v3f16_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -238,15 +210,11 @@ define amdgpu_ps void @store_v4f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v4f16_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v4f16_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -270,15 +238,11 @@ define amdgpu_ps void @store_f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, ; GFX11-LABEL: store_f16_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_f16_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[2:3], [v0, v1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -303,15 +267,11 @@ define amdgpu_ps void @store_v2f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX11-LABEL: store_v2f16_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v2f16_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[2:3], [v0, v1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -336,15 +296,11 @@ define amdgpu_ps void @store_v3f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX11-LABEL: store_v3f16_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v3f16_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[2:3], [v0, v1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -369,15 +325,11 @@ define amdgpu_ps void @store_v4f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX11-LABEL: store_v4f16_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 d16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v4f16_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[2:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 d16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll index 5d4c91ad8a51..1110892f6ac0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll @@ -18,15 +18,11 @@ define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 ; GFX11-LABEL: store_f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_f32_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -48,15 +44,11 @@ define amdgpu_ps void @store_v2f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v2f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v2f32_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -78,15 +70,11 @@ define amdgpu_ps void @store_v3f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v3f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v3f32_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -108,15 +96,11 @@ define amdgpu_ps void @store_v4f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v4f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v4f32_1d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -138,15 +122,11 @@ define amdgpu_ps void @store_f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 ; GFX11-LABEL: store_f32_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_f32_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -169,15 +149,11 @@ define amdgpu_ps void @store_v2f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v2f32_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v2f32_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -200,15 +176,11 @@ define amdgpu_ps void @store_v3f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v3f32_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm 
a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v3f32_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -231,15 +203,11 @@ define amdgpu_ps void @store_v4f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, ; GFX11-LABEL: store_v4f32_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v4f32_2d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -262,15 +230,11 @@ define amdgpu_ps void @store_f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, ; GFX11-LABEL: store_f32_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_f32_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -294,15 +258,11 @@ define amdgpu_ps void @store_v2f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX11-LABEL: store_v2f32_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v2f32_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -326,15 +286,11 @@ define amdgpu_ps void @store_v3f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX11-LABEL: store_v3f32_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v3f32_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -358,15 +314,11 @@ define amdgpu_ps void @store_v4f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX11-LABEL: store_v4f32_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_v4f32_3d: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: image_store v[2:5], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll index 3781faa54e7d..d08f826e8062 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll @@ -17,8 +17,6 @@ define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0) @@ -35,8 +33,6 @@ define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF) @@ -51,8 +47,6 @@ define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) { ; GFX11-NEXT: s_movk_i32 s0, 0x1000 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000) @@ -69,8 +63,6 @@ define amdgpu_cs void @vgpr_inverse_ballot(i32 %input, ptr addrspace(1) %out) { ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[1:2], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) @@ -84,8 +76,6 @@ define amdgpu_cs void @sgpr_inverse_ballot(i32 inreg %input, ptr addrspace(1) %o ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) @@ -105,8 +95,6 @@ define amdgpu_cs void @phi_uniform(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace ; GFX11-NEXT: .LBB5_2: ; %endif ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %cc = icmp ne i32 %s2, 0 @@ -140,8 +128,6 @@ define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr ; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: inverse_ballot_branch: @@ -155,8 +141,6 @@ define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr ; SDAG-NEXT: ; %bb.2: ; %endif ; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; SDAG-NEXT: global_store_b32 v[0:1], v2, off -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll index 29218a362521..2f82ceb37eb9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -18,8 +18,6 @@ define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) { ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: constant_false_inverse_ballot: @@ -29,8 +27,6 @@ define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) { ; SDAG-NEXT: v_mov_b32_e32 v3, s2 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 0) @@ -48,8 +44,6 @@ define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) { ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: constant_true_inverse_ballot: @@ -59,8 +53,6 @@ define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) { ; SDAG-NEXT: v_mov_b32_e32 v3, s2 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0xFFFFFFFFFFFFFFFF) @@ -79,8 +71,6 @@ define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) { ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: constant_mask_inverse_ballot: @@ -91,8 +81,6 @@ define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) { ; SDAG-NEXT: v_mov_b32_e32 v3, s2 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0x0040F8010000) @@ -111,8 +99,6 @@ define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) { ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[2:3], v[4:5], off -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: vgpr_inverse_ballot: @@ -123,8 +109,6 @@ define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) { ; SDAG-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input) @@ -139,8 +123,6 @@ define amdgpu_cs void @sgpr_inverse_ballot(i64 inreg %input, ptr addrspace(1) %o ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; 
GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: sgpr_inverse_ballot: @@ -150,8 +132,6 @@ define amdgpu_cs void @sgpr_inverse_ballot(i64 inreg %input, ptr addrspace(1) %o ; SDAG-NEXT: s_waitcnt_depctr 0xfffe ; SDAG-NEXT: v_mov_b32_e32 v3, s0 ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input) @@ -173,8 +153,6 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: phi_uniform: @@ -189,8 +167,6 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SDAG-NEXT: v_mov_b32_e32 v3, s2 ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: %cc = icmp ne i64 %s2, 0 @@ -226,8 +202,6 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr ; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: inverse_ballot_branch: @@ -244,8 +218,6 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr ; SDAG-NEXT: ; %bb.2: ; %endif ; SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %s2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 42e8b2608dc1..80cd97c0c262 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -98,8 +98,6 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index f8e60e5eb09a..8383621cef2f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index c4a86952bc41..aa6069c67f62 
100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -34,8 +34,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_i32: @@ -48,8 +46,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -79,8 +75,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_f32: @@ -93,8 +87,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -142,8 +134,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64: @@ -158,8 +148,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64: @@ -174,8 +162,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64: @@ -190,8 +176,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, 
i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -239,8 +223,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64: @@ -255,8 +237,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64: @@ -271,8 +251,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64: @@ -287,8 +265,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -318,8 +294,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vii_i32: @@ -330,8 +304,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -361,8 +333,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vii_f32: @@ -373,8 +343,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) store float %v, ptr 
addrspace(1) %out @@ -416,8 +384,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64: @@ -430,8 +396,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64: @@ -444,8 +408,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64: @@ -458,8 +420,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -501,8 +461,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64: @@ -515,8 +473,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64: @@ -529,8 +485,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64: @@ -543,8 +497,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ 
-577,8 +529,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vll_i32: @@ -591,8 +541,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -638,8 +586,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64: @@ -654,8 +600,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64: @@ -670,8 +614,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64: @@ -686,8 +628,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -719,8 +659,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vll_f32: @@ -733,8 +671,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -780,8 
+716,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64: @@ -796,8 +730,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64: @@ -812,8 +744,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64: @@ -828,8 +758,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -868,8 +796,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: @@ -887,8 +813,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: @@ -905,8 +829,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: @@ -922,8 +844,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -977,8 +897,6 @@ define amdgpu_kernel void 
@v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vvv_i64: @@ -997,8 +915,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -1039,8 +955,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: @@ -1058,8 +972,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: @@ -1076,8 +988,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: @@ -1093,8 +1003,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -1148,8 +1056,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vvv_f64: @@ -1168,8 +1074,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -1211,8 +1115,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: 
s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: @@ -1225,8 +1127,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: @@ -1239,8 +1139,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: @@ -1253,8 +1151,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -1308,8 +1204,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vvs_i64: @@ -1327,8 +1221,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -1369,8 +1261,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: @@ -1383,8 +1273,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: @@ -1397,8 +1285,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: 
v_permlane16_b32_vvs_f32: @@ -1411,8 +1297,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -1466,8 +1350,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vvs_f64: @@ -1485,8 +1367,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -1517,8 +1397,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32: @@ -1532,8 +1410,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32: @@ -1547,8 +1423,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32: @@ -1562,8 +1436,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -1617,8 +1489,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; 
GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64: @@ -1636,8 +1506,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64: @@ -1655,8 +1523,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64: @@ -1674,8 +1540,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -1706,8 +1570,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32: @@ -1721,8 +1583,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32: @@ -1736,8 +1596,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32: @@ -1751,8 +1609,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -1806,8 +1662,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; 
GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64: @@ -1825,8 +1679,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64: @@ -1844,8 +1696,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64: @@ -1863,8 +1713,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -1895,8 +1743,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_fi_i32: @@ -1909,8 +1755,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out @@ -1958,8 +1802,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: @@ -1974,8 +1816,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: @@ -1990,8 +1830,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; 
GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: @@ -2006,8 +1844,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i64 %v, ptr addrspace(1) %out @@ -2037,8 +1873,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_fi_f32: @@ -2051,8 +1885,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) store float %v, ptr addrspace(1) %out @@ -2100,8 +1932,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: @@ -2116,8 +1946,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: @@ -2132,8 +1960,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: @@ -2148,8 +1974,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) store double %v, ptr addrspace(1) %out @@ -2179,8 +2003,6 @@ define amdgpu_kernel void 
@v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_bc_i32: @@ -2193,8 +2015,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out @@ -2242,8 +2062,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: @@ -2258,8 +2076,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: @@ -2274,8 +2090,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: @@ -2290,8 +2104,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i64 %v, ptr addrspace(1) %out @@ -2321,8 +2133,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_bc_f32: @@ -2335,8 +2145,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, 
float %src0, i32 %src1, i32 %src2, i1 false, i1 true) store float %v, ptr addrspace(1) %out @@ -2384,8 +2192,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: @@ -2400,8 +2206,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: @@ -2416,8 +2220,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: @@ -2432,8 +2234,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) store double %v, ptr addrspace(1) %out @@ -2463,8 +2263,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32: @@ -2477,8 +2275,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out @@ -2526,8 +2322,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: @@ -2542,8 +2336,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: 
v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: @@ -2558,8 +2350,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: @@ -2574,8 +2364,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i64 %v, ptr addrspace(1) %out @@ -2605,8 +2393,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32: @@ -2619,8 +2405,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) store float %v, ptr addrspace(1) %out @@ -2668,8 +2452,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: @@ -2684,8 +2466,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: @@ -2700,8 +2480,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: @@ -2716,8 
+2494,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) store double %v, ptr addrspace(1) %out @@ -2747,8 +2523,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_i32: @@ -2761,8 +2535,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -2792,8 +2564,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_f32: @@ -2806,8 +2576,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -2855,8 +2623,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64: @@ -2871,8 +2637,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64: @@ -2887,8 +2651,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: 
v_permlanex16_b32_vss_i64: @@ -2903,8 +2665,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -2952,8 +2712,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64: @@ -2968,8 +2726,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64: @@ -2984,8 +2740,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64: @@ -3000,8 +2754,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -3031,8 +2783,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vii_i32: @@ -3043,8 +2793,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -3074,8 +2822,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: 
s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vii_f32: @@ -3086,8 +2832,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -3129,8 +2873,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64: @@ -3143,8 +2885,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64: @@ -3157,8 +2897,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64: @@ -3171,8 +2909,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -3214,8 +2950,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64: @@ -3228,8 +2962,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64: @@ -3242,8 +2974,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: 
v_permlanex16_b32_vii_f64: @@ -3256,8 +2986,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -3290,8 +3018,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vll_i32: @@ -3304,8 +3030,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -3337,8 +3061,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vll_f32: @@ -3351,8 +3073,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -3398,8 +3118,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64: @@ -3414,8 +3132,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64: @@ -3430,8 +3146,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; 
GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64: @@ -3446,8 +3160,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -3493,8 +3205,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64: @@ -3509,8 +3219,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64: @@ -3525,8 +3233,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64: @@ -3541,8 +3247,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -3581,8 +3285,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: @@ -3600,8 +3302,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: @@ -3618,8 +3318,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: @@ -3635,8 +3333,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -3677,8 +3373,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: @@ -3696,8 +3390,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: @@ -3714,8 +3406,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: @@ -3731,8 +3421,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -3786,8 +3474,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vvv_i64: @@ -3806,8 +3492,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -3861,8 +3545,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vvv_f64: @@ -3881,8 +3563,6 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -3924,8 +3604,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32: @@ -3938,8 +3616,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: @@ -3952,8 +3628,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: @@ -3966,8 +3640,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -4008,8 +3680,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: @@ -4022,8 +3692,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: @@ -4036,8 +3704,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: @@ -4050,8 +3716,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 
s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -4105,8 +3769,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vvs_i64: @@ -4124,8 +3786,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -4179,8 +3839,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vvs_f64: @@ -4198,8 +3856,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -4230,8 +3886,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32: @@ -4245,8 +3899,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32: @@ -4260,8 +3912,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32: @@ -4275,8 +3925,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: 
global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -4307,8 +3955,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32: @@ -4322,8 +3968,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32: @@ -4337,8 +3981,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32: @@ -4352,8 +3994,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -4407,8 +4047,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64: @@ -4426,8 +4064,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64: @@ -4445,8 +4081,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64: @@ -4464,8 +4098,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; 
GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -4519,8 +4151,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64: @@ -4538,8 +4168,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64: @@ -4557,8 +4185,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64: @@ -4576,8 +4202,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -4608,8 +4232,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32: @@ -4622,8 +4244,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out @@ -4653,8 +4273,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32: @@ -4667,8 +4285,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) store float %v, ptr addrspace(1) %out @@ -4716,8 +4332,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: @@ -4732,8 +4346,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: @@ -4748,8 +4360,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: @@ -4764,8 +4374,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i64 %v, ptr addrspace(1) %out @@ -4813,8 +4421,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: @@ -4829,8 +4435,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: @@ -4845,8 +4449,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: @@ -4861,8 +4463,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) store double %v, ptr addrspace(1) %out @@ -4892,8 +4492,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32: @@ -4906,8 +4504,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out @@ -4937,8 +4533,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32: @@ -4951,8 +4545,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) store float %v, ptr addrspace(1) %out @@ -5000,8 +4592,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: @@ -5016,8 +4606,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: @@ -5032,8 +4620,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 
op_sel:[0,1] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: @@ -5048,8 +4634,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i64 %v, ptr addrspace(1) %out @@ -5097,8 +4681,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: @@ -5113,8 +4695,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: @@ -5129,8 +4709,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: @@ -5145,8 +4723,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) store double %v, ptr addrspace(1) %out @@ -5176,8 +4752,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32: @@ -5190,8 +4764,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call i32 
@llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out @@ -5221,8 +4793,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32: @@ -5235,8 +4805,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) store float %v, ptr addrspace(1) %out @@ -5284,8 +4852,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: @@ -5300,8 +4866,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: @@ -5316,8 +4880,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: @@ -5332,8 +4894,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i64 %v, ptr addrspace(1) %out @@ -5381,8 +4941,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: @@ -5397,8 +4955,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX11-GISEL-NEXT: v_permlanex16_b32 
v0, v0, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: @@ -5413,8 +4969,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: @@ -5429,8 +4983,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) store double %v, ptr addrspace(1) %out @@ -5459,8 +5011,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_tid_tid_i32: @@ -5473,8 +5023,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5504,8 +5052,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_tid_tid_f32: @@ -5518,8 +5064,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -5567,8 +5111,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: @@ -5583,8 +5125,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i 
; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: @@ -5599,8 +5139,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: @@ -5615,8 +5153,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -5665,8 +5201,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: @@ -5682,8 +5216,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: @@ -5699,8 +5231,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: @@ -5716,8 +5246,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -5749,8 +5277,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_undef_tid_i32: @@ -5763,8 +5289,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; 
GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -5795,8 +5319,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_undef_tid_f32: @@ -5809,8 +5331,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -5857,8 +5377,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: @@ -5873,8 +5391,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: @@ -5889,8 +5405,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: @@ -5905,8 +5419,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -5956,8 +5468,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: @@ -5973,8 +5483,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: @@ -5990,8 +5498,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: @@ -6007,8 +5513,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -6055,8 +5559,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32: @@ -6071,8 +5573,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32: @@ -6086,8 +5586,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32: @@ -6102,8 +5600,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -6147,8 +5643,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32: @@ -6163,8 +5657,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: 
v_permlane16_b32_i_tid_f32: @@ -6178,8 +5670,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32: @@ -6194,8 +5684,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -6245,8 +5733,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64: @@ -6261,8 +5747,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64: @@ -6277,8 +5761,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64: @@ -6293,8 +5775,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -6348,8 +5828,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64: @@ -6366,8 +5844,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64: @@ -6384,8 +5860,6 @@ define amdgpu_kernel void 
@v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64: @@ -6402,8 +5876,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -6435,8 +5907,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32: @@ -6449,8 +5919,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -6481,8 +5949,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32: @@ -6495,8 +5961,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -6543,8 +6007,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: @@ -6559,8 +6021,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: @@ -6575,8 +6035,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, 
s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: @@ -6591,8 +6049,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -6642,8 +6098,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: @@ -6659,8 +6113,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: @@ -6676,8 +6128,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: @@ -6693,8 +6143,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -6727,8 +6175,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32: @@ -6741,8 +6187,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -6773,8 +6217,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr 
addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32: @@ -6787,8 +6229,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -6835,8 +6275,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: @@ -6851,8 +6289,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: @@ -6867,8 +6303,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: @@ -6883,8 +6317,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -6934,8 +6366,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: @@ -6951,8 +6381,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: @@ -6968,8 +6396,6 @@ define amdgpu_kernel void 
@v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: @@ -6985,8 +6411,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -7019,8 +6443,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: @@ -7033,8 +6455,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -7065,8 +6485,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: @@ -7079,8 +6497,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -7127,8 +6543,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: @@ -7143,8 +6557,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: @@ -7159,8 +6571,6 @@ define amdgpu_kernel void 
@v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: @@ -7175,8 +6585,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -7226,8 +6634,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: @@ -7243,8 +6649,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: @@ -7260,8 +6664,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: @@ -7277,8 +6679,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -7311,8 +6711,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32: @@ -7325,8 +6723,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 
@llvm.amdgcn.permlanex16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7356,8 +6752,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32: @@ -7370,8 +6764,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -7419,8 +6811,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: @@ -7435,8 +6825,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: @@ -7451,8 +6839,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: @@ -7467,8 +6853,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -7517,8 +6901,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: @@ -7534,8 +6916,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: @@ -7551,8 +6931,6 @@ define 
amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: @@ -7568,8 +6946,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -7601,8 +6977,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32: @@ -7615,8 +6989,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -7647,8 +7019,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32: @@ -7661,8 +7031,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -7709,8 +7077,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: @@ -7725,8 +7091,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: @@ -7741,8 +7105,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, 
s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: @@ -7757,8 +7119,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -7808,8 +7168,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: @@ -7825,8 +7183,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: @@ -7842,8 +7198,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: @@ -7859,8 +7213,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -7907,8 +7259,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: @@ -7923,8 +7273,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: @@ -7938,8 +7286,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; 
GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: @@ -7954,8 +7300,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7999,8 +7343,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: @@ -8015,8 +7357,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: @@ -8030,8 +7370,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: @@ -8046,8 +7384,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -8097,8 +7433,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: @@ -8113,8 +7447,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: @@ -8129,8 +7461,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; 
GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: @@ -8145,8 +7475,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -8200,8 +7528,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 ; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: @@ -8218,8 +7544,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 ; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: @@ -8236,8 +7560,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: @@ -8254,8 +7576,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -8287,8 +7607,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32: @@ -8301,8 +7619,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -8333,8 +7649,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: 
v_permlanex16_b32_i_tid_fi_f32: @@ -8347,8 +7661,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -8395,8 +7707,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: @@ -8411,8 +7721,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: @@ -8427,8 +7735,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: @@ -8443,8 +7749,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -8494,8 +7798,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: @@ -8511,8 +7813,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: @@ -8528,8 +7828,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: 
s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: @@ -8545,8 +7843,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -8579,8 +7875,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32: @@ -8593,8 +7887,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -8625,8 +7917,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32: @@ -8639,8 +7929,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -8687,8 +7975,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: @@ -8703,8 +7989,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: @@ -8719,8 +8003,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; 
GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: @@ -8735,8 +8017,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -8786,8 +8066,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: @@ -8803,8 +8081,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: @@ -8820,8 +8096,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: @@ -8837,8 +8111,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -8871,8 +8143,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: @@ -8885,8 +8155,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -8917,8 +8185,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-NEXT: 
global_store_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: @@ -8931,8 +8197,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float @@ -8979,8 +8243,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: @@ -8995,8 +8257,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: @@ -9011,8 +8271,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: @@ -9027,8 +8285,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -9078,8 +8334,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: @@ -9095,8 +8349,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: @@ -9112,8 +8364,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX12-SDAG-NEXT: v_permlanex16_b32 
v1, v1, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: @@ -9129,8 +8379,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index a65143255bbb..59be9f8641c1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -17,8 +17,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: @@ -30,8 +28,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -48,8 +44,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: @@ -61,8 +55,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 1, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -79,8 +71,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: @@ -92,8 +82,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] 
-; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -110,8 +98,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: @@ -123,8 +109,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %tidx, i1 false, i1 false) @@ -142,8 +126,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: @@ -155,8 +137,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out @@ -173,8 +153,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: @@ -186,8 +164,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out @@ -204,8 +180,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: @@ -217,8 +191,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) 
%out, i ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out @@ -235,8 +207,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: @@ -248,8 +218,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -266,8 +234,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: @@ -279,8 +245,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 1, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -297,8 +261,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: @@ -310,8 +272,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -328,8 +288,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: 
@@ -341,8 +299,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %tidx, i1 false, i1 false) @@ -360,8 +316,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: @@ -373,8 +327,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out @@ -391,8 +343,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: @@ -404,8 +354,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out @@ -422,8 +370,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: @@ -435,8 +381,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out @@ -455,8 +399,6 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: 
global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid: @@ -470,8 +412,6 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.var(i32 %tidx, i32 %tidx, i32 %src1, i1 false, i1 false) @@ -491,8 +431,6 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid: @@ -506,8 +444,6 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -528,8 +464,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid: @@ -545,8 +479,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.var(i32 12345, i32 %tidx, i32 %src1, i1 false, i1 false) @@ -566,8 +498,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi: @@ -581,8 +511,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -603,8 +531,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 
op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc: @@ -618,8 +544,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -640,8 +564,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc: @@ -655,8 +577,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -677,8 +597,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid: @@ -692,8 +610,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %tidx, i32 %tidx, i32 %src1, i1 false, i1 false) @@ -713,8 +629,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid: @@ -728,8 +642,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -750,8 +662,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; 
GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid: @@ -767,8 +677,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.var(i32 12345, i32 %tidx, i32 %src1, i1 false, i1 false) @@ -788,8 +696,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi: @@ -803,8 +709,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -825,8 +729,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc: @@ -840,8 +742,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -862,8 +762,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc: @@ -877,8 +775,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index abb2f8777818..216731519731 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -16,8 +16,6 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane64(i32 %src0) store i32 %v, ptr addrspace(1) %out @@ -33,8 +31,6 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane64(i32 99) store i32 %v, ptr addrspace(1) %out @@ -50,8 +46,6 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll index afa3fe8c2f1f..22c369e2da72 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -12,8 +12,6 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) { ; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 ; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0) store ptr %v, ptr addrspace(1) %out @@ -42,8 +40,6 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[2:3] offset:16 ; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[2:3] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr> @llvm.amdgcn.permlane64.v3p0(<3 x ptr> %src0) store <3 x ptr> %v, ptr addrspace(1) %out @@ -61,8 +57,6 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(3) @llvm.amdgcn.permlane64.v3p0(ptr addrspace(3) %src0) store ptr addrspace(3) %v, ptr addrspace(1) %out @@ -85,8 +79,6 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 ; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0) store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out @@ -104,8 +96,6 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr 
addrspace(5) %src0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0) store ptr addrspace(5) %v, ptr addrspace(1) %out @@ -128,8 +118,6 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 ; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0) store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out @@ -147,8 +135,6 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0) store ptr addrspace(6) %v, ptr addrspace(1) %out @@ -171,8 +157,6 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 ; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0) store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll index c951a07a44ae..afc5807e4f5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll @@ -68,8 +68,6 @@ define amdgpu_cs void @test_quadmask_sgpr_i32(i32 inreg %mask, ptr addrspace(1) ; GFX11-NEXT: s_quadmask_b32 s0, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %mask) @@ -154,8 +152,6 @@ define amdgpu_cs void @test_quadmask_sgpr_i64(i64 inreg %mask, ptr addrspace(1) ; GFX11-NEXT: s_quadmask_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll index af8023788d2e..6e24717a2827 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll @@ -51,8 +51,6 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_i8_tfe: @@ -64,8 +62,6 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { i8, i32 } @llvm.amdgcn.raw.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { i8, i32 } %res, 0 @@ -119,8 +115,6 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_i16_tfe: @@ -132,8 +126,6 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { i16, i32 } @llvm.amdgcn.raw.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { i16, i32 } %res, 0 @@ -187,8 +179,6 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_f16_tfe: @@ -200,8 +190,6 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { half, i32 } @llvm.amdgcn.raw.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { half, i32 } %res, 0 @@ -255,8 +243,6 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_i32_tfe: @@ -268,8 +254,6 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { i32, i32 } @llvm.amdgcn.raw.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { i32, i32 } %res, 0 @@ -343,8 +327,6 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX11-NEXT: global_store_b32 v[2:3], v6, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_v2i32_tfe: @@ -356,8 +338,6 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; 
GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX12-NEXT: global_store_b32 v[2:3], v6, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { <2 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <2 x i32>, i32 } %res, 0 @@ -431,8 +411,6 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX11-NEXT: global_store_b32 v[2:3], v6, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_v2f32_tfe: @@ -444,8 +422,6 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX12-NEXT: global_store_b32 v[2:3], v6, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { <2 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <2 x float>, i32 } %res, 0 @@ -524,8 +500,6 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX11-NEXT: global_store_b32 v[2:3], v7, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_v3i32_tfe: @@ -538,8 +512,6 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX12-NEXT: global_store_b32 v[2:3], v7, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { <3 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <3 x i32>, i32 } %res, 0 @@ -618,8 +590,6 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX11-NEXT: global_store_b32 v[2:3], v7, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_v3f32_tfe: @@ -632,8 +602,6 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX12-NEXT: global_store_b32 v[2:3], v7, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { <3 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <3 x float>, i32 } %res, 0 @@ -699,8 +667,6 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b32 v[2:3], v8, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_v4i32_tfe: @@ -713,8 +679,6 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off ; 
GFX12-NEXT: global_store_b32 v[2:3], v8, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { <4 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %res, 0 @@ -780,8 +744,6 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b32 v[2:3], v8, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_load_v4f32_tfe: @@ -794,8 +756,6 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX12-NEXT: global_store_b32 v[2:3], v8, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %res = call { <4 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <4 x float>, i32 } %res, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll index 75da1adc3123..fd6e354b274a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -17,8 +17,6 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, < ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 glc ; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 slc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) @@ -36,8 +34,6 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { ; GFX11-LABEL: buffer_store_immoffs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) @@ -53,8 +49,6 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { ; GFX11-LABEL: buffer_store_ofs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) @@ -86,8 +80,6 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, ; GFX11-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], 0 offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) @@ -105,8 +97,6 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 % ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) @@ -122,8 +112,6 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) @@ -142,8 +130,6 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -174,8 +160,6 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i3 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 @@ -207,8 +191,6 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsr ; GFX11-NEXT: buffer_store_b64 v[1:2], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -234,8 +216,6 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i ; GFX11-LABEL: buffer_store_x2_offen_merged_and: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -255,8 +235,6 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 @@ -278,8 +256,6 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, floa ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) @@ -299,8 +275,6 @@ define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x ; GFX11-LABEL: buffer_store_x2_offset_merged: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> 
%rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -321,8 +295,6 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 glc ; GFX11-NEXT: buffer_store_b32 v6, off, s[0:3], 0 slc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0) @@ -342,8 +314,6 @@ define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 @@ -363,8 +333,6 @@ define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 @@ -382,8 +350,6 @@ define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) { ; GFX11-LABEL: raw_buffer_store_f16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 @@ -410,8 +376,6 @@ define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %dat ; GFX11-LABEL: buffer_store_v2f16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) @@ -440,8 +404,6 @@ define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %dat ; GFX11-LABEL: buffer_store_v4f16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) @@ -457,8 +419,6 @@ define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) { ; GFX11-LABEL: raw_buffer_store_i16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 @@ -483,8 +443,6 @@ define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data ; GFX11-LABEL: buffer_store_v2i16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) @@ -511,8 +469,6 @@ define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data ; GFX11-LABEL: buffer_store_v4i16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 
v[0:1], v2, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) @@ -531,8 +487,6 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) @@ -563,8 +517,6 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> ; GFX11-NEXT: buffer_store_b32 v3, off, s[0:3], 0 offset:16 ; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 offset:28 ; GFX11-NEXT: buffer_store_b32 v5, off, s[0:3], 0 offset:32 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll index 82dd35ab4c24..e1f84dcbaa60 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll @@ -31,8 +31,6 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %d ; GFX11-LABEL: buffer_store_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -66,8 +64,6 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf ; GFX11-LABEL: buffer_store_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -105,8 +101,6 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf ; GFX11-LABEL: buffer_store_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -158,8 +152,6 @@ define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bf ; GFX11-LABEL: buffer_store_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll index 30f04f1ff220..af84994ca8c4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll @@ -42,8 +42,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) @@ -90,8 +88,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) @@ -146,8 +142,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -202,8 +196,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll index 22396631b38d..9440efefe18a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll @@ -28,8 +28,6 @@ define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x f ; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] glc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -56,8 +54,6 @@ define amdgpu_ps void @tbuffer_store_immoffs(ptr addrspace(8) inreg, <4 x float> ; GFX11-LABEL: tbuffer_store_immoffs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = 
bitcast <4 x float> %1 to <4 x i32> @@ -79,8 +75,6 @@ define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(ptr addrspace(8) inreg, ; GFX11-LABEL: tbuffer_store_scalar_and_imm_offs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -102,8 +96,6 @@ define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float> %vda ; GFX11-LABEL: buffer_store_ofs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -125,8 +117,6 @@ define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_x v0, off, s[0:3], 0 format:125 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %data.i = bitcast float %data to i32 @@ -148,8 +138,6 @@ define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %data.i = bitcast <2 x float> %data to <2 x i32> @@ -171,8 +159,6 @@ define amdgpu_ps void @buffer_store_voffset_large_12bit(ptr addrspace(8) inreg % ; GFX11-LABEL: buffer_store_voffset_large_12bit: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 4092, i32 0, i32 63, i32 0) @@ -196,8 +182,6 @@ define amdgpu_ps void @buffer_store_voffset_large_13bit(ptr addrspace(8) inreg % ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0x1000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 8188, i32 0, i32 63, i32 0) @@ -221,8 +205,6 @@ define amdgpu_ps void @buffer_store_voffset_large_16bit(ptr addrspace(8) inreg % ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0xf000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 65532, i32 0, i32 63, i32 0) @@ -246,8 +228,6 @@ define amdgpu_ps void @buffer_store_voffset_large_23bit(ptr addrspace(8) inreg % ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0x7ff000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 8388604, i32 0, i32 63, i32 0) @@ -271,8 +251,6 @@ define amdgpu_ps void @buffer_store_voffset_large_24bit(ptr addrspace(8) inreg % ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0xfff000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 16777212, i32 0, i32 63, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index a241bdeaff1a..eb349cbbe44d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -44,8 +44,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: @@ -56,8 +54,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] -; GFX12-PACKED-NEXT: s_nop 0 -; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) @@ -104,8 +100,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: @@ -116,8 +110,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] -; GFX12-PACKED-NEXT: s_nop 0 -; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) @@ -172,8 +164,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: @@ -186,8 +176,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; 
 ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-SDAG-NEXT: s_nop 0
-; GFX12-PACKED-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz:
@@ -201,8 +189,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-GISEL-NEXT: s_nop 0
-; GFX12-PACKED-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-GISEL-NEXT: s_endpgm
 main_body:
 %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32>
@@ -257,8 +243,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT: s_nop 0
-; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT: s_endpgm
 ;
 ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -270,8 +254,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-NEXT: s_nop 0
-; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-NEXT: s_endpgm
 main_body:
 call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
index 8641bf1b03f3..118fed169696 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
@@ -30,8 +30,6 @@ define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>,
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] glc
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store:
@@ -41,8 +39,6 @@ define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>,
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[4:7], off, s[0:3], null format:[BUF_FMT_32_32_32_32_UINT] th:TH_STORE_NT
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], null format:78 th:TH_STORE_HT
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], null format:78 th:TH_STORE_RT_NT
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -69,15 +65,11 @@ define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 ; GFX11-LABEL: tbuffer_store_immoffs:
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store_immoffs:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:117 offset:42
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -99,15 +91,11 @@ define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x f
 ; GFX11-LABEL: tbuffer_store_scalar_and_imm_offs:
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store_scalar_and_imm_offs:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -129,15 +117,11 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32
 ; GFX11-LABEL: buffer_store_ofs:
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_ofs:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:115 offen
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -159,15 +143,11 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) {
 ; GFX11-LABEL: buffer_store_x1:
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: tbuffer_store_format_x v0, off, s[0:3], 0 format:125
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_x1:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_x v0, off, s[0:3], null format:125
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 %data.i = bitcast float %data to i32
@@ -189,15 +169,11 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data)
 ; GFX11-LABEL: buffer_store_x2:
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_x2:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_xy v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 %data.i = bitcast <2 x float> %data to <2 x i32>
@@ -219,15 +195,11 @@ define amdgpu_ps void @buffer_store_voffset_large_12bit(<4 x i32> inreg %rsrc, <
 ; GFX11-LABEL: buffer_store_voffset_large_12bit:
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_12bit:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 4092, i32 0, i32 63, i32 0)
@@ -251,15 +223,11 @@ define amdgpu_ps void @buffer_store_voffset_large_13bit(<4 x i32> inreg %rsrc, <
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: v_mov_b32_e32 v4, 0x1000
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_13bit:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:8188
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 8188, i32 0, i32 63, i32 0)
@@ -283,15 +251,11 @@ define amdgpu_ps void @buffer_store_voffset_large_16bit(<4 x i32> inreg %rsrc, <
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: v_mov_b32_e32 v4, 0xf000
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_16bit:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:65532
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 65532, i32 0, i32 63, i32 0)
@@ -315,15 +279,11 @@ define amdgpu_ps void @buffer_store_voffset_large_23bit(<4 x i32> inreg %rsrc, <
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: v_mov_b32_e32 v4, 0x7ff000
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_23bit:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:8388604
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 8388604, i32 0, i32 63, i32 0)
@@ -347,16 +307,12 @@ define amdgpu_ps void @buffer_store_voffset_large_24bit(<4 x i32> inreg %rsrc, <
 ; GFX11: ; %bb.0: ; %main_body
 ; GFX11-NEXT: v_mov_b32_e32 v4, 0xfff000
 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_24bit:
 ; GFX12: ; %bb.0: ; %main_body
 ; GFX12-NEXT: v_mov_b32_e32 v4, 0x800000
 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:8388604
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 16777212, i32 0, i32 63, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index 6cc2393d598e..ce536a36b760 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -89,8 +89,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_nop 0
-; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1164GISEL-LABEL: uniform_value:
@@ -102,8 +100,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_nop 0
-; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT: s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: uniform_value:
@@ -114,8 +110,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_nop 0
-; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1132GISEL-LABEL: uniform_value:
@@ -126,8 +120,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_nop 0
-; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT: s_endpgm
 entry:
 %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
@@ -199,8 +191,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_nop 0
-; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1164GISEL-LABEL: const_value:
@@ -210,8 +200,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_nop 0
-; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT: s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: const_value:
@@ -220,8 +208,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_nop 0
-; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1132GISEL-LABEL: const_value:
@@ -230,8 +216,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_nop 0
-; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT: s_endpgm
 entry:
 %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
@@ -452,8 +436,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164DAGISEL-NEXT: s_nop 0
-; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1164GISEL-LABEL: divergent_value:
@@ -476,8 +458,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_nop 0
-; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT: s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: divergent_value:
@@ -499,8 +479,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132DAGISEL-NEXT: s_nop 0
-; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1132GISEL-LABEL: divergent_value:
@@ -522,8 +500,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_nop 0
-; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT: s_endpgm
 entry:
 %id.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -847,8 +823,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_nop 0
-; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1164GISEL-LABEL: divergent_cfg:
@@ -886,8 +860,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_nop 0
-; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT: s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: divergent_cfg:
@@ -926,8 +898,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_nop 0
-; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1132GISEL-LABEL: divergent_cfg:
@@ -964,8 +934,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX1132GISEL-NEXT: s_nop 0
-; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index f7f8536219db..ffb27f402090 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -90,8 +90,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_nop 0
-; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1164GISEL-LABEL: uniform_value:
@@ -103,8 +101,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_nop 0
-; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT: s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: uniform_value:
@@ -115,8 +111,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_nop 0
-; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1132GISEL-LABEL: uniform_value:
@@ -127,8 +121,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_nop 0
-; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT: s_endpgm
 entry:
 %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %in, i32 1)
@@ -200,8 +192,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_nop 0
-; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1164GISEL-LABEL: const_value:
@@ -211,8 +201,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_nop 0
-; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT: s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: const_value:
@@ -221,8 +209,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_nop 0
-; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1132GISEL-LABEL: const_value:
@@ -231,8 +217,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_nop 0
-; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT: s_endpgm
 entry:
 %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 123, i32 1)
@@ -453,8 +437,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164DAGISEL-NEXT: s_nop 0
-; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1164GISEL-LABEL: divergent_value:
@@ -477,8 +459,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_nop 0
-; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT: s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: divergent_value:
@@ -500,8 +480,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132DAGISEL-NEXT: s_nop 0
-; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1132GISEL-LABEL: divergent_value:
@@ -523,8 +501,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_nop 0
-; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT: s_endpgm
 entry:
 %id.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -848,8 +824,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_nop 0
-; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1164GISEL-LABEL: divergent_cfg:
@@ -887,8 +861,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_nop 0
-; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT: s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: divergent_cfg:
@@ -927,8 +899,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_nop 0
-; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT: s_endpgm
 ;
 ; GFX1132GISEL-LABEL: divergent_cfg:
@@ -965,8 +935,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX1132GISEL-NEXT: s_nop 0
-; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT: s_endpgm
 entry:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index d521a6c25e46..8d99ec2e1b70 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -110,8 +110,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT4-NEXT: global_load_b32 v0, v[0:1], off
 ; VARIANT4-NEXT: s_wait_loadcnt 0x0
 ; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1]
-; VARIANT4-NEXT: s_nop 0
-; VARIANT4-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT4-NEXT: s_endpgm
 ;
 ; VARIANT5-LABEL: test_barrier:
@@ -134,8 +132,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT5-NEXT: global_load_b32 v0, v[0:1], off
 ; VARIANT5-NEXT: s_wait_loadcnt 0x0
 ; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1]
-; VARIANT5-NEXT: s_nop 0
-; VARIANT5-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT5-NEXT: s_endpgm
 ;
 ; VARIANT6-LABEL: test_barrier:
@@ -160,8 +156,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6-NEXT: global_load_b32 v0, v[0:1], off
 ; VARIANT6-NEXT: s_wait_loadcnt 0x0
 ; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1]
-; VARIANT6-NEXT: s_nop 0
-; VARIANT6-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT6-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 1e13b40afb8b..61baca24fbdc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -17,8 +17,6 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: s_barrier_signal -1
 ; GFX12-SDAG-NEXT: s_barrier_wait -1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_signal:
@@ -35,8 +33,6 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: s_barrier_signal -1
 ; GFX12-GISEL-NEXT: s_barrier_wait -1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -65,8 +61,6 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: s_barrier_signal 1
 ; GFX12-SDAG-NEXT: s_barrier_wait 1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_barrier_signal:
@@ -83,8 +77,6 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: s_barrier_signal 1
 ; GFX12-GISEL-NEXT: s_barrier_wait 1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -113,8 +105,6 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: s_barrier_signal 0
 ; GFX12-SDAG-NEXT: s_barrier_wait 0
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_barrier_signal:
@@ -131,8 +121,6 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: s_barrier_signal 0
 ; GFX12-GISEL-NEXT: s_barrier_wait 0
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -162,8 +150,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GFX12-SDAG-NEXT: s_barrier_signal m0
 ; GFX12-SDAG-NEXT: s_barrier_wait 1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_signal_var:
@@ -182,8 +168,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GFX12-GISEL-NEXT: s_barrier_signal m0
 ; GFX12-GISEL-NEXT: s_barrier_wait 1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -247,8 +231,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst:
@@ -272,8 +254,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -307,8 +287,6 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst:
@@ -332,8 +310,6 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -367,8 +343,6 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_barrier_signal_isfirst:
@@ -392,8 +366,6 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -428,8 +400,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst_var:
@@ -454,8 +424,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -557,8 +525,6 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
 ; GFX12-SDAG-NEXT: s_barrier_init -1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_init:
@@ -574,8 +540,6 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_barrier_init -1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -603,8 +567,6 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
 ; GFX12-SDAG-NEXT: s_barrier_init 1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_barrier_init:
@@ -620,8 +582,6 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_barrier_init 1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -649,8 +609,6 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
 ; GFX12-SDAG-NEXT: s_barrier_init 0
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_barrier_init:
@@ -666,8 +624,6 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_barrier_init 0
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -697,8 +653,6 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar,
 ; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
 ; GFX12-SDAG-NEXT: s_barrier_init m0
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test4_s_barrier_init:
@@ -715,8 +669,6 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar,
 ; GFX12-GISEL-NEXT: s_or_b32 m0, s2, s3
 ; GFX12-GISEL-NEXT: s_barrier_init m0
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -779,8 +731,6 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT: s_barrier_join -1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_join:
@@ -795,8 +745,6 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_barrier_join -1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -823,8 +771,6 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT: s_barrier_join 1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_barrier_join:
@@ -839,8 +785,6 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_barrier_join 1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -867,8 +811,6 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT: s_barrier_join 0
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_barrier_join:
@@ -883,8 +825,6 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_barrier_join 0
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -911,8 +851,6 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
 ; GFX12-SDAG-NEXT: s_barrier_join m0
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test4_s_barrier_join_m0:
@@ -928,8 +866,6 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_barrier_join m0
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1003,8 +939,6 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp
 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_leave:
@@ -1027,8 +961,6 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1057,8 +989,6 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT: s_wakeup_barrier -1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_wakeup_barrier:
@@ -1073,8 +1003,6 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_wakeup_barrier -1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1101,8 +1029,6 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT: s_wakeup_barrier 1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_wakeup_barrier:
@@ -1117,8 +1043,6 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_wakeup_barrier 1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1145,8 +1069,6 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT: s_wakeup_barrier 0
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_wakeup_barrier:
@@ -1161,8 +1083,6 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_wakeup_barrier 0
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1189,8 +1109,6 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
 ; GFX12-SDAG-NEXT: s_wakeup_barrier m0
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test4_s_wakeup_barrier_m0:
@@ -1206,8 +1124,6 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT: s_wakeup_barrier m0
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1262,8 +1178,6 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1287,8 +1201,6 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1312,8 +1224,6 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1339,8 +1249,6 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i
 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
 ; GFX12-NEXT: v_mov_b32_e32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1419,8 +1327,6 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT: s_barrier_signal -1
 ; GFX12-SDAG-NEXT: s_barrier_wait -1
 ; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test_barrier_convert:
@@ -1437,8 +1343,6 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT: s_barrier_signal -1
 ; GFX12-GISEL-NEXT: s_barrier_wait -1
 ; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 entry:
 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index d085b3c768a8..1dbcc21500bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -736,8 +736,6 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: s_buffer_load_index_across_bb:
@@ -755,8 +753,6 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 main_body:
 %tmp = shl i32 %index, 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index 363c54d4abe9..a9823d89048b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -10,8 +10,6 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_get_doorbell:
@@ -21,8 +19,6 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 128)
 store i32 %ret, ptr addrspace(1) %out
@@ -37,8 +33,6 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_get_ddid:
@@ -48,8 +42,6 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 129)
 store i32 %ret, ptr addrspace(1) %out
@@ -65,8 +57,6 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 130)
 store i64 %ret, ptr addrspace(1) %out
@@ -82,8 +72,6 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 131)
 store i64 %ret, ptr addrspace(1) %out
@@ -98,8 +86,6 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_savewave:
@@ -109,8 +95,6 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 132)
 store i32 %ret, ptr addrspace(1) %out
@@ -126,8 +110,6 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 133)
 store i64 %ret, ptr addrspace(1) %out
@@ -142,8 +124,6 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT: s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_get_0_i32:
@@ -153,8 +133,6 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT: s_endpgm
 %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
 store i32 %ret, ptr addrspace(1) %out
@@ -170,8 +148,6 @@ define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 99999)
 store i64 %ret, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
index c99a082afe2d..60c04749c9b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -51,8 +51,6 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b32 v[2:3], v5, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_i8_tfe:
@@ -64,8 +62,6 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b8 v[0:1], v4, off
 ; GFX12-NEXT: global_store_b32 v[2:3], v5, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { i8, i32 } @llvm.amdgcn.struct.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { i8, i32 } %res, 0
@@ -119,8 +115,6 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b32 v[2:3], v5, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_i16_tfe:
@@ -132,8 +126,6 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b16 v[0:1], v4, off
 ; GFX12-NEXT: global_store_b32 v[2:3], v5, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { i16, i32 } @llvm.amdgcn.struct.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { i16, i32 } %res, 0
@@ -187,8 +179,6 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b32 v[2:3], v5, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_f16_tfe:
@@ -200,8 +190,6 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b16 v[0:1], v4, off
 ; GFX12-NEXT: global_store_b32 v[2:3], v5, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { half, i32 } @llvm.amdgcn.struct.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { half, i32 } %res, 0
@@ -255,8 +243,6 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT: global_store_b32 v[2:3], v5, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_i32_tfe:
@@ -268,8 +254,6 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b32 v[0:1], v4, off
 ; GFX12-NEXT: global_store_b32 v[2:3], v5, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { i32, i32 } %res, 0
@@ -343,8 +327,6 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off
 ; GFX11-NEXT: global_store_b32 v[2:3], v6, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v2i32_tfe:
@@ -356,8 +338,6 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
 ; GFX12-NEXT: global_store_b32 v[2:3], v6, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { <2 x i32>, i32 } %res, 0
@@ -431,8 +411,6 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off
 ; GFX11-NEXT: global_store_b32 v[2:3], v6, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v2f32_tfe:
@@ -444,8 +422,6 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
 ; GFX12-NEXT: global_store_b32 v[2:3], v6, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { <2 x float>, i32 } %res, 0
@@ -524,8 +500,6 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off
 ; GFX11-NEXT: global_store_b32 v[2:3], v7, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v3i32_tfe:
@@ -538,8 +512,6 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off
 ; GFX12-NEXT: global_store_b32 v[2:3], v7, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { <3 x i32>, i32 } %res, 0
@@ -618,8 +590,6 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off
 ; GFX11-NEXT: global_store_b32 v[2:3], v7, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v3f32_tfe:
@@ -632,8 +602,6 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off
 ; GFX12-NEXT: global_store_b32 v[2:3], v7, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { <3 x float>, i32 } %res, 0
@@ -699,8 +667,6 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT: global_store_b32 v[2:3], v8, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v4i32_tfe:
@@ -713,8 +679,6 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
 ; GFX12-NEXT: global_store_b32 v[2:3], v8, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
 %data = extractvalue { <4 x i32>, i32 } %res, 0
@@ -780,8 +744,6 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT: global_store_b32 v[2:3], v8, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v4f32_tfe:
@@ -794,8 +756,6 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
 ; GFX12-NEXT: global_store_b32 v[2:3], v8, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %res = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
i32 } @llvm.amdgcn.struct.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x float>, i32 } %res, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll index 94c9f7ab8e75..87b83f68d685 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -19,8 +19,6 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, < ; GFX11-NEXT: buffer_store_b128 v[0:3], v12, s[0:3], 0 idxen ; GFX11-NEXT: buffer_store_b128 v[4:7], v12, s[0:3], 0 idxen glc ; GFX11-NEXT: buffer_store_b128 v[8:11], v12, s[0:3], 0 idxen slc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) @@ -40,8 +38,6 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen offset:42 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) @@ -57,8 +53,6 @@ define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { ; GFX11-LABEL: buffer_store_idx: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) @@ -80,8 +74,6 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) @@ -97,8 +89,6 @@ define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) ; GFX11-LABEL: buffer_store_both: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) @@ -116,8 +106,6 @@ define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_store_b128 v[0:3], v[5:6], s[0:3], 0 idxen offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) @@ -149,8 +137,6 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, ; GFX11-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) @@ -168,8 +154,6 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 % ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -185,8 +169,6 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -209,8 +191,6 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i ; GFX11-NEXT: buffer_store_b128 v[0:3], v7, s[0:3], 0 idxen ; GFX11-NEXT: buffer_store_b64 v[4:5], v7, s[0:3], 0 idxen glc ; GFX11-NEXT: buffer_store_b32 v6, v7, s[0:3], 0 idxen slc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) @@ -230,8 +210,6 @@ define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1 ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b8 v0, v1, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 @@ -251,8 +229,6 @@ define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v2 = fptrunc float %v1 to half call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -277,8 +253,6 @@ define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x hal ; GFX11-LABEL: struct_buffer_store_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void @@ -306,8 +280,6 @@ define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x hal ; GFX11-LABEL: struct_buffer_store_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void @@ -324,8 +296,6 @@ define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 @@ -351,8 +321,6 @@ define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16 ; GFX11-LABEL: struct_buffer_store_vif16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void @@ -378,8 +346,6 @@ define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16 ; GFX11-LABEL: struct_buffer_store_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll index 38fdcf47171a..d625dc17286a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll @@ -46,8 +46,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) @@ -98,8 +96,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) @@ -162,8 +158,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -226,8 +220,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll index f84422efcf99..753d17aee546 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll @@ -40,8 +40,6 @@ define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x f ; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], v12, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] idxen glc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen slc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen glc dlc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -77,8 +75,6 @@ define amdgpu_ps void @tbuffer_store_immoffs(ptr addrspace(8) inreg, <4 x float> ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:117 idxen offset:42 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -109,8 +105,6 @@ define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(ptr addrspace(8) inreg, ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], s4 format:117 idxen offset:42 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -137,8 +131,6 @@ define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float> %vda ; GFX11-LABEL: buffer_store_idx: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -177,8 +169,6 @@ define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float> %vda ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:115 idxen offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -205,8 +195,6 @@ define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float> %vd ; GFX11-LABEL: buffer_store_both: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:70 idxen offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -246,8 +234,6 @@ define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float> %vd ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_8_8_8_8_UINT] idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -277,8 +263,6 @@ define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_x v0, v1, s[0:3], 0 
format:125 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %data.i = bitcast float %data to i32 @@ -305,8 +289,6 @@ define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: %data.i = bitcast <2 x float> %data to <2 x i32> @@ -337,8 +319,6 @@ define amdgpu_ps void @buffer_store_voffset_large_12bit(ptr addrspace(8) inreg % ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 4092, i32 0, i32 63, i32 0) @@ -376,8 +356,6 @@ define amdgpu_ps void @buffer_store_voffset_large_13bit(ptr addrspace(8) inreg % ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x1000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 8188, i32 0, i32 63, i32 0) @@ -415,8 +393,6 @@ define amdgpu_ps void @buffer_store_voffset_large_16bit(ptr addrspace(8) inreg % ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0xf000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 65532, i32 0, i32 63, i32 0) @@ -454,8 +430,6 @@ define amdgpu_ps void @buffer_store_voffset_large_23bit(ptr addrspace(8) inreg % ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x7ff000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 8388604, i32 0, i32 63, i32 0) @@ -493,8 +467,6 @@ define amdgpu_ps void @buffer_store_voffset_large_24bit(ptr addrspace(8) inreg % ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0xfff000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 16777212, i32 0, i32 63, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 1da076c65239..e4199e199feb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -49,8 +49,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: @@ -62,8 +60,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-NEXT: s_nop 0 -; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) @@ -114,8 +110,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: @@ -127,8 +121,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-NEXT: s_nop 0 -; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) @@ -191,8 +183,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: @@ -206,8 +196,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-SDAG-NEXT: s_nop 0 -; GFX12-PACKED-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-PACKED-SDAG-NEXT: s_endpgm ; ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: @@ -221,8 +209,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; 
GFX12-PACKED-GISEL-NEXT: s_nop 0 -; GFX12-PACKED-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-PACKED-GISEL-NEXT: s_endpgm main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -285,8 +271,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-NEXT: s_nop 0 -; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: @@ -299,8 +283,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-NEXT: s_nop 0 -; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll index 47b7658f50cc..ab0f18938298 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -41,8 +41,6 @@ define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, ; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], v12, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] idxen glc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen slc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen glc dlc -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: tbuffer_store: @@ -53,8 +51,6 @@ define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, ; GFX12-NEXT: tbuffer_store_format_xyzw v[4:7], v12, s[0:3], null format:[BUF_FMT_32_32_32_32_UINT] idxen th:TH_STORE_NT ; GFX12-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], null format:78 idxen th:TH_STORE_HT ; GFX12-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], null format:78 idxen th:TH_STORE_RT_NT -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -90,16 +86,12 @@ define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:117 idxen offset:42 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: tbuffer_store_immoffs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:117 idxen offset:42 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -130,16 +122,12 @@ define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x f ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, 
s[0:3], s4 format:117 idxen offset:42 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: tbuffer_store_scalar_and_imm_offs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], s4 format:117 idxen offset:42 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -166,15 +154,11 @@ define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 ; GFX11-LABEL: buffer_store_idx: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_idx: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_8_8_8_8_SINT] idxen -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -213,16 +197,12 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:115 idxen offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_ofs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], null format:115 idxen offen -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -249,15 +229,11 @@ define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i3 ; GFX11-LABEL: buffer_store_both: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:70 idxen offen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_both: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], null format:70 idxen offen -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -297,8 +273,6 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3 ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_8_8_8_8_UINT] idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_wait: @@ -307,8 +281,6 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3 ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], null idxen ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], null format:[BUF_FMT_8_8_8_8_UINT] idxen -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -338,15 +310,11 @@ define 
amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 % ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_x1: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_store_format_x v0, v1, s[0:3], null format:125 idxen -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %data.i = bitcast float %data to i32 @@ -373,15 +341,11 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_x2: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: tbuffer_store_format_xy v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: %data.i = bitcast <2 x float> %data to <2 x i32> @@ -412,16 +376,12 @@ define amdgpu_ps void @buffer_store_voffset_large_12bit(<4 x i32> inreg %rsrc, < ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_voffset_large_12bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 4092, i32 0, i32 63, i32 0) @@ -459,16 +419,12 @@ define amdgpu_ps void @buffer_store_voffset_large_13bit(<4 x i32> inreg %rsrc, < ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x1000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_voffset_large_13bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:8188 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 8188, i32 0, i32 63, i32 0) @@ -506,16 +462,12 @@ define amdgpu_ps void @buffer_store_voffset_large_16bit(<4 x i32> inreg %rsrc, < ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0xf000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_voffset_large_16bit: ; GFX12: ; %bb.0: ; 
%main_body ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:65532 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 65532, i32 0, i32 63, i32 0) @@ -553,16 +505,12 @@ define amdgpu_ps void @buffer_store_voffset_large_23bit(<4 x i32> inreg %rsrc, < ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x7ff000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_voffset_large_23bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:8388604 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 8388604, i32 0, i32 63, i32 0) @@ -600,16 +548,12 @@ define amdgpu_ps void @buffer_store_voffset_large_24bit(<4 x i32> inreg %rsrc, < ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0xfff000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_voffset_large_24bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v5, 0x800000 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:8388604 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 16777212, i32 0, i32 63, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll index 5ea89bc57491..d44d2a4f7338 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll @@ -23,8 +23,6 @@ define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %waveid = call i32 @llvm.amdgcn.wave.id() store i32 %waveid, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll index 3d5e6dca4561..3874a456590d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll @@ -19,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) @@ -37,8 +35,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) @@ -55,8 +51,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0) @@ -71,8 +65,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1) @@ -91,8 +83,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0) @@ -118,8 +108,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0) @@ -138,8 +126,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0) @@ -154,8 +140,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1) @@ -174,8 +158,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0) @@ -201,8 +183,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0) @@ -221,8 +201,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -237,8 +215,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -253,8 +229,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -269,8 +243,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -285,8 +257,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -301,8 +271,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -317,8 +285,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -333,8 +299,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -351,8 +315,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -367,8 +329,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -383,8 +343,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -399,8 +357,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -416,8 +372,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -432,8 +386,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -448,8 +400,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32 ; 
W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -464,8 +414,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll index d11aaf0036b3..25adc25d7176 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll @@ -17,8 +17,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %A, <16 x half> %B, <4 x float> %C) @@ -33,8 +31,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C) @@ -49,8 +45,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0) @@ -63,8 +57,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1) @@ -79,8 +71,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35] ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0) @@ -102,8 +92,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h ; 
W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43] ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0) @@ -120,8 +108,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0) @@ -134,8 +120,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1) @@ -150,8 +134,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35] ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0) @@ -173,8 +155,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43] ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0) @@ -191,8 +171,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -206,8 +184,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -220,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; W64-NEXT: 
global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -234,8 +208,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -248,8 +220,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -262,8 +232,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -276,8 +244,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -290,8 +256,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -306,8 +270,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -320,8 +282,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -334,8 +294,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 ; W64: ; %bb.0: ; %bb
 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
 ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT: s_nop 0
-; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT: s_endpgm
 bb:
 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -348,8 +306,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 ; W64: ; %bb.0: ; %bb
 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
 ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT: s_nop 0
-; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT: s_endpgm
 bb:
 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -362,8 +318,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 ; W64: ; %bb.0: ; %bb
 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
 ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT: s_nop 0
-; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT: s_endpgm
 bb:
 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -376,8 +330,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 ; W64: ; %bb.0: ; %bb
 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
 ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT: s_nop 0
-; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT: s_endpgm
 bb:
 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -390,8 +342,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 ; W64: ; %bb.0: ; %bb
 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
 ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT: s_nop 0
-; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT: s_endpgm
 bb:
 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -404,8 +354,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 ; W64: ; %bb.0: ; %bb
 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
 ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT: s_nop 0
-; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT: s_endpgm
 bb:
 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
index d51efc2b275d..0418f32c3f73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
@@ -64,8 +64,6 @@ define amdgpu_cs void @test_s_wqm_sgpr_i32(i32 inreg %mask, ptr addrspace(1) %ou
 ; GFX11-NEXT: s_wqm_b32 s0, s0
 ; GFX11-NEXT: v_mov_b32_e32 v2, s0
 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask)
@@ -145,8 +143,6 @@ define amdgpu_cs void @test_s_wqm_sgpr_i64(i64 inreg %mask, ptr addrspace(1) %ou
 ; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1]
 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 9d93ca65683c..837d484583d5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -48,8 +48,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
@@ -88,8 +86,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load i32, ptr addrspace(1) %out
 %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
@@ -145,8 +141,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i64:
@@ -196,8 +190,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load i64, ptr addrspace(1) %out
 %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
@@ -253,8 +245,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_f64:
@@ -304,8 +294,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load double, ptr addrspace(1) %out
 %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
@@ -354,8 +342,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s2
 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32:
@@ -398,8 +384,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s2
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load i32, ptr addrspace(1) %out
 %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 32, i32 %src1, i32 %oldval)
@@ -454,8 +438,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s4
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s4
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64:
@@ -504,8 +486,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s4
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s4
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load i64, ptr addrspace(1) %out
 %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 32, i32 %src1, i64 %oldval)
@@ -565,8 +545,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s4
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s4
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64:
@@ -619,8 +597,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 0, s4
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s2, s4
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load double, ptr addrspace(1) %out
 %writelane = call double @llvm.amdgcn.writelane.f64(double 32.0, i32 %src1, double %oldval)
@@ -686,8 +662,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 12, s2
 ; GFX1100-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32:
@@ -748,8 +722,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 12, s2
 ; GFX1100-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
@@ -825,8 +797,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s3
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 12, s3
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64:
@@ -890,8 +860,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 12, s2
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v2, 0, s2
 ; GFX1100-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
@@ -972,8 +940,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s3
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s3
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64:
@@ -1041,8 +1007,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s2
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v2, s3, s2
 ; GFX1100-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
@@ -1108,8 +1072,6 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s2
 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32:
@@ -1163,8 +1125,6 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s2
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load i32, ptr addrspace(1) %out
 %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
@@ -1214,8 +1174,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32
 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_i32:
@@ -1258,8 +1216,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load i32, ptr addrspace(1) %out
 %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 32, i32 %oldval) #0
@@ -1309,8 +1265,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, 32
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_i64:
@@ -1354,8 +1308,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, 32
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load i64, ptr addrspace(1) %out
 %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 32, i64 %oldval) #0
@@ -1405,8 +1357,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, 32
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_f64:
@@ -1450,8 +1400,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, 32
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %oldval = load double, ptr addrspace(1) %out
 %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 32, double %oldval) #0
@@ -1495,8 +1443,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32:
@@ -1534,8 +1480,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
 store i32 %writelane, ptr addrspace(1) %out, align 4
@@ -1587,8 +1531,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64:
@@ -1635,8 +1577,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
 store i64 %writelane, ptr addrspace(1) %out, align 4
@@ -1688,8 +1628,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64:
@@ -1736,8 +1674,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
 store double %writelane, ptr addrspace(1) %out, align 4
@@ -1775,8 +1711,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3
 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
@@ -1809,8 +1743,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 42)
 store i32 %writelane, ptr addrspace(1) %out, align 4
@@ -1859,8 +1791,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64:
@@ -1904,8 +1834,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 42)
 store i64 %writelane, ptr addrspace(1) %out, align 4
@@ -1954,8 +1882,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0
 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0
 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64:
@@ -1999,8 +1925,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0
 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double 42.0)
 store double %writelane, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 4faa482ede59..42a59ec7bccb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -62,8 +62,6 @@ define amdgpu_kernel void @ceil_f16(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: ceil_f16:
@@ -82,8 +80,6 @@ define amdgpu_kernel void @ceil_f16(
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT: s_nop 0
-; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
@@ -171,8 +167,6 @@ define amdgpu_kernel void @ceil_v2f16(
 ; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: ceil_v2f16:
@@ -195,8 +189,6 @@ define amdgpu_kernel void @ceil_v2f16(
 ; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v1, v1
 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT: s_nop 0
-; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 0b18e5f35a31..62bdef89b5a0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -79,8 +79,6 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_cos_f16_e32 v1, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a.val = load half, ptr addrspace(1) %a
 %r.val = call half @llvm.cos.f16(half %a.val)
@@ -189,8 +187,6 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a.val = load <2 x half>, ptr addrspace(1) %a
 %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 61f6c9f7f0e6..6036150e1890 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -62,8 +62,6 @@ define amdgpu_kernel void @floor_f16(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: floor_f16:
@@ -82,8 +80,6 @@ define amdgpu_kernel void @floor_f16(
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT: s_nop 0
-; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
@@ -172,8 +168,6 @@ define amdgpu_kernel void @floor_v2f16(
 ; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: floor_v2f16:
@@ -196,8 +190,6 @@ define amdgpu_kernel void @floor_v2f16(
 ; GFX11-FAKE16-NEXT: v_floor_f16_e32 v1, v1
 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT: s_nop 0
-; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 92658e660ea6..10f87e74f39d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -189,8 +189,6 @@ define amdgpu_kernel void @fmuladd_f16(
 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT: s_nop 0
-; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT: s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_f16:
@@ -219,8 +217,6 @@ define amdgpu_kernel void @fmuladd_f16(
 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
 ; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[8:11], 0
-; GFX11-DENORM-NEXT: s_nop 0
-; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -381,8 +377,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT: s_nop 0
-; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT: s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_f16_imm_a:
@@ -407,8 +401,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
 ; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
-; GFX11-DENORM-NEXT: s_nop 0
-; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %b,
@@ -567,8 +559,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT: s_nop 0
-; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT: s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_f16_imm_b:
@@ -593,8 +583,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
 ; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
-; GFX11-DENORM-NEXT: s_nop 0
-; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -807,8 +795,6 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
 ; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT: s_nop 0
-; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT: s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_v2f16:
@@ -837,8 +823,6 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
 ; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-DENORM-NEXT: s_nop 0
-; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
index 2bb4cc617e7f..ccd30d3d8bea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
@@ -147,8 +147,6 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %fpmode = call i32 @llvm.get.fpmode.i32()
 store i32 %fpmode, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index f86c8294ab3c..86311ab85925 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -79,8 +79,6 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
 ; GFX11CHECK-NEXT: s_cselect_b32 s2, -1, 0
 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT: s_nop 0
-; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT: s_endpgm
 %result = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 3)
 %sext = sext i1 %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 3d8e9e609730..a577fb3d190a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -86,8 +86,6 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
 ; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3
 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT: s_nop 0
-; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT: s_endpgm
 %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3)
 %sext = sext i1 %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index e05f3f1e65ff..96551d5bf785 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -81,8 +81,6 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
 ; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT: s_nop 0
-; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT: s_endpgm
 %result = call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; nan
 %sext = sext i1 %result to i32
@@ -166,8 +164,6 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
 ; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT: s_nop 0
-; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT: s_endpgm
 %result = call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; nan
 %sext = sext i1 %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 7f4cf19e9b85..8f28208945fb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -208,8 +208,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_f32:
@@ -236,8 +234,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log_f32:
@@ -595,8 +591,6 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
 ; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_v2f32:
@@ -630,8 +624,6 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log_v2f32:
@@ -1136,8 +1128,6 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
 ; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_v3f32:
@@ -1192,8 +1182,6 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log_v3f32:
@@ -1826,8 +1814,6 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
 ; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_v4f32:
@@ -1888,8 +1874,6 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
 ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 1c64e6b76c95..e80d8e3bfb38 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -208,8 +208,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_f32:
@@ -236,8 +234,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log10_f32:
@@ -595,8 +591,6 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
 ; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_v2f32:
@@ -630,8 +624,6 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log10_v2f32:
@@ -1136,8 +1128,6 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
 ; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_v3f32:
@@ -1192,8 +1182,6 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log10_v3f32:
@@ -1826,8 +1814,6 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
 ; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_v4f32:
@@ -1888,8 +1874,6 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
 ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log10_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 50c52037dc4d..6578311178ab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -142,8 +142,6 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_f32:
@@ -162,8 +160,6 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log2_f32:
@@ -379,8 +375,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2
 ; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v2f32:
@@ -403,8 +397,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log2_v2f32:
@@ -686,8 +678,6 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1
 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v3f32:
@@ -719,8 +709,6 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5
 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log2_v3f32:
@@ -1065,8 +1053,6 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1]
-; GFX1100-SDAG-NEXT: s_nop 0
-; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT: s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v4f32:
@@ -1103,8 +1089,6 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7
 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0
 ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX1100-GISEL-NEXT: s_nop 0
-; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT: s_endpgm
 ;
 ; R600-LABEL: s_log2_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 2c9ce001b8c4..1daa45285e68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -138,8 +138,6 @@ define amdgpu_kernel void @maxnum_f16(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -248,8 +246,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %b) #0 {
@@ -356,8 +352,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) #0 {
@@ -469,8 +463,6 @@ define amdgpu_kernel void @maxnum_v2f16(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_pk_max_f16 v0, v1, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -561,8 +553,6 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %b) #0 {
@@ -651,8 +641,6 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) #0 {
@@ -787,8 +775,6 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -934,8 +920,6 @@ define amdgpu_kernel void @maxnum_v4f16(
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT: v_pk_max_f16 v0, v3, v2
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -1053,8 +1037,6 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
 ; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0
 ; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 59508e049a3a..00cba8c77fc7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -137,8 +137,6 @@ define amdgpu_kernel void @minnum_f16_ieee(
 ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -274,8 +272,6 @@ define amdgpu_kernel void @minnum_f16_imm_a(
 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %b) #0 {
@@ -381,8 +377,6 @@ define amdgpu_kernel void @minnum_f16_imm_b(
 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) #0 {
@@ -493,8 +487,6 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: v_pk_min_f16 v0, v1, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -619,8 +611,6 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %b) #0 {
@@ -708,8 +698,6 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) #0 {
@@ -843,8 +831,6 @@ define amdgpu_kernel void @minnum_v3f16(
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -989,8 +975,6 @@ define amdgpu_kernel void @minnum_v4f16(
 ; GFX11-NEXT: v_pk_min_f16 v1, v1, v0
 ; GFX11-NEXT: v_pk_min_f16 v0, v3, v2
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
@@ -1107,8 +1091,6 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
 ; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0
 ; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 2d3b18dcb121..7e3158bd1106 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -448,8 +448,6 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: umulo_i64_s:
@@ -476,8 +474,6 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; GFX12-NEXT: s_cselect_b32 s1, 0, s1
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 bb:
 %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
@@ -661,8 +657,6 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: smulo_i64_s:
@@ -703,8 +697,6 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 bb:
 %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index bf7dbcde62fd..4fc401ff20ac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -80,8 +80,6 @@ define amdgpu_kernel void @rint_f16(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_rndne_f16_e32 v0, v0
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
@@ -187,8 +185,6 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX11-NEXT: v_rndne_f16_e32 v1, v1
 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index e7b17c30cf75..5347f0f00c31 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -78,8 +78,6 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; R600-LABEL: round_f32:
@@ -209,8 +207,6 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; R600-LABEL: round_v2f32:
@@ -393,8 +389,6 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
 ; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s4
 ; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7
 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; R600-LABEL: round_v4f32:
@@ -709,8 +703,6 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; R600-LABEL: round_v8f32:
@@ -853,8 +845,6 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; R600-LABEL: round_f16:
@@ -1007,8 +997,6 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; R600-LABEL: round_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 70f15bd0aa61..d618b9379102 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1779,8 +1779,6 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 tail call void @llvm.set.rounding(i32 1)
 %set.mode = tail call i32 @llvm.get.rounding()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 3ae0cf65eb00..a177a61823dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -79,8 +79,6 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_sin_f16_e32 v1, v1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a.val = load half, ptr addrspace(1) %a
 %r.val = call half @llvm.sin.f16(half %a.val)
@@ -189,8 +187,6 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a.val = load <2 x half>, ptr addrspace(1) %a
 %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index c69ebedbec50..21c4455565db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -61,8 +61,6 @@ define amdgpu_kernel void @sqrt_f16(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
@@ -148,8 +146,6 @@ define amdgpu_kernel void @sqrt_v2f16(
 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 11f5e6ebf999..623db04a0e90 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -61,8 +61,6 @@ define amdgpu_kernel void @trunc_f16(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_trunc_f16_e32 v0, v0
 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
@@ -148,8 +146,6 @@ define amdgpu_kernel void @trunc_v2f16(
 ; GFX11-NEXT: v_trunc_f16_e32 v1, v1
 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
index 668ebe3f953b..b88266981a25 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
@@ -22,8 +22,6 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_or3_b32 v2, s2, v2, s0
 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: test_uniform_load_b96:
@@ -43,8 +41,6 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_or3_b32 v2, v2, s1, s2
 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 bb:
 %i = zext i32 %arg to i64
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
index 029c4e51e299..1afd1786569b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur
 ; GFX12-NEXT: s_add_f32 s0, s7, s0
 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX12-NEXT: global_store_b32 v0, v1, s[10:11]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
 %out_ptr.promoted = load float, ptr addrspace(1) %out_ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 7202ab8b3146..e8115a3db557 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %ld = load double, ptr addrspace(4) %in
 store double %ld, ptr addrspace(1) %out
@@ -155,8 +153,6 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_add_f64_e32 v[0:1], s[14:15], v[0:1]
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[18:19]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 entry:
 %out_ptr.promoted = load double, ptr addrspace(1) %out_ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 32abe50ff04d..d8b4fadeebba 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -73,8 +73,6 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load i1, ptr addrspace(4) %in
 store i1 %load, ptr addrspace(1) %out
@@ -146,8 +144,6 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <2 x i1>, ptr addrspace(4) %in
 store <2 x i1> %load, ptr addrspace(1) %out
@@ -218,8 +214,6 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <3 x i1>, ptr addrspace(4) %in
 store <3 x i1> %load, ptr addrspace(1) %out
@@ -291,8 +285,6 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <4 x i1>, ptr addrspace(4) %in
 store <4 x i1> %load, ptr addrspace(1) %out
@@ -364,8 +356,6 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <8 x i1>, ptr addrspace(4) %in
 store <8 x i1> %load, ptr addrspace(1) %out
@@ -437,8 +427,6 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <16 x i1>, ptr addrspace(4) %in
 store <16 x i1> %load, ptr addrspace(1) %out
@@ -494,8 +482,6 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <32 x i1>, ptr addrspace(4) %in
 store <32 x i1> %load, ptr addrspace(1) %out
@@ -554,8 +540,6 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <64 x i1>, ptr addrspace(4) %in
 store <64 x i1> %load, ptr addrspace(1) %out
@@ -617,8 +601,6 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %a = load i1, ptr addrspace(4) %in
 %ext = zext i1 %a to i32
@@ -686,8 +668,6 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %a = load i1, ptr addrspace(4) %in
 %ext = sext i1 %a to i32
@@ -750,8 +730,6 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <1 x i1>, ptr addrspace(4) %in
 %ext = zext <1 x i1> %load to <1 x i32>
@@ -819,8 +797,6 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <1 x i1>, ptr addrspace(4) %in
 %ext = sext <1 x i1> %load to <1 x i32>
@@ -894,8 +870,6 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 1, v1
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <2 x i1>, ptr addrspace(4) %in
 %ext = zext <2 x i1> %load to <2 x i32>
@@ -969,8 +943,6 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT: v_mov_b32_e32 v0, s3
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load
<2 x i1>, ptr addrspace(4) %in %ext = sext <2 x i1> %load to <2 x i32> @@ -1054,8 +1026,6 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: v_bfe_u32 v1, v1, 1, 1 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = zext <3 x i1> %load to <3 x i32> @@ -1138,8 +1108,6 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = sext <3 x i1> %load to <3 x i32> @@ -1223,8 +1191,6 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = zext <4 x i1> %load to <4 x i32> @@ -1311,8 +1277,6 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = sext <4 x i1> %load to <4 x i32> @@ -1438,8 +1402,6 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = zext <8 x i1> %load to <8 x i32> @@ -1561,8 +1523,6 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = sext <8 x i1> %load to <8 x i32> @@ -1762,8 +1722,6 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = zext <16 x i1> %load to <16 x i32> @@ -1957,8 +1915,6 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = sext <16 
x i1> %load to <16 x i32> @@ -2328,8 +2284,6 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = zext <32 x i1> %load to <32 x i32> @@ -2732,8 +2686,6 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = sext <32 x i1> %load to <32 x i32> @@ -3436,8 +3388,6 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = zext <64 x i1> %load to <64 x i32> @@ -4200,8 +4150,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = sext <64 x i1> %load to <64 x i32> @@ -4271,8 +4219,6 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %a = load i1, ptr addrspace(4) %in %ext = zext i1 %a to i64 @@ -4343,8 +4289,6 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %a = load i1, ptr addrspace(4) %in %ext = sext i1 %a to i64 @@ -4414,8 +4358,6 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <1 x i1>, ptr addrspace(4) %in %ext = zext <1 x i1> %load to <1 x i64> @@ -4486,8 +4428,6 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <1 x i1>, ptr 
addrspace(4) %in %ext = sext <1 x i1> %load to <1 x i64> @@ -4570,8 +4510,6 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in %ext = zext <2 x i1> %load to <2 x i64> @@ -4656,8 +4594,6 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in %ext = sext <2 x i1> %load to <2 x i64> @@ -4761,8 +4697,6 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = zext <3 x i1> %load to <3 x i64> @@ -4873,8 +4807,6 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = sext <3 x i1> %load to <3 x i64> @@ -4991,8 +4923,6 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = zext <4 x i1> %load to <4 x i64> @@ -5116,8 +5046,6 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = sext <4 x i1> %load to <4 x i64> @@ -5282,8 +5210,6 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = zext <8 x i1> %load to <8 x i64> @@ -5489,8 +5415,6 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = sext <8 x i1> %load to <8 x i64> @@ -5781,8 +5705,6 @@ define amdgpu_kernel 
void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112 ; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = zext <16 x i1> %load to <16 x i64> @@ -6144,8 +6066,6 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = sext <16 x i1> %load to <16 x i64> @@ -6675,8 +6595,6 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = zext <32 x i1> %load to <32 x i64> @@ -7385,8 +7303,6 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = sext <32 x i1> %load to <32 x i64> @@ -8415,8 +8331,6 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = zext <64 x i1> %load to <64 x i64> @@ -9805,8 +9719,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = sext <64 x i1> %load to <64 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 2ee1c60b4bbf..7b2ccb60d142 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -83,8 +83,6 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load i16, ptr addrspace(4) %in @@ -153,8 +151,6 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <2 x i16>, ptr addrspace(4) %in @@ -261,8 +257,6 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] offset:4 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i16>, ptr addrspace(4) %in @@ -335,8 +329,6 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <4 x i16>, ptr addrspace(4) %in @@ -416,8 +408,6 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <8 x i16>, ptr addrspace(4) %in @@ -534,8 +524,6 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i16>, ptr addrspace(4) %in @@ -774,8 +762,6 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i16>, ptr addrspace(4) %ptr0, align 2 @@ -851,8 +837,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = zext i16 %a to i32 @@ -929,8 +913,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = sext i16 %a to i32 @@ -1006,8 +988,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = zext <1 x i16> %load to <1 x i32> @@ -1084,8 +1064,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: 
s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = sext <1 x i16> %load to <1 x i32> @@ -1170,8 +1148,6 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = zext <2 x i16> %load to <2 x i32> @@ -1258,8 +1234,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = sext <2 x i16> %load to <2 x i32> @@ -1356,8 +1330,6 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i16>, ptr addrspace(4) %in @@ -1458,8 +1430,6 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s3 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i16>, ptr addrspace(4) %in @@ -1566,8 +1536,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = zext <4 x i16> %load to <4 x i32> @@ -1675,8 +1643,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = sext <4 x i16> %load to <4 x i32> @@ -1835,8 +1801,6 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(4) %in %ext = zext <8 x i16> %load to <8 x i32> @@ -1997,8 +1961,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(4) 
%in %ext = sext <8 x i16> %load to <8 x i32> @@ -2261,8 +2223,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = zext <16 x i16> %load to <16 x i32> @@ -2529,8 +2489,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = sext <16 x i16> %load to <16 x i32> @@ -3010,8 +2968,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(4) %in %ext = zext <32 x i16> %load to <32 x i32> @@ -3501,8 +3457,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(4) %in %ext = sext <32 x i16> %load to <32 x i32> @@ -4424,8 +4378,6 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(4) %in %ext = zext <64 x i16> %load to <64 x i32> @@ -5363,8 +5315,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(4) %in %ext = sext <64 x i16> %load to <64 x i32> @@ -5446,8 +5396,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = zext i16 %a to i64 @@ -5538,8 +5486,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = sext i16 %a to i64 @@ -5621,8 +5567,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = zext <1 x i16> %load to <1 x i64> @@ -5708,8 +5652,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = sext <1 x i16> %load to <1 x i64> @@ -5803,8 +5745,6 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = zext <2 x i16> %load to <2 x i64> @@ -5903,8 +5843,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = sext <2 x i16> %load to <2 x i64> @@ -6036,8 +5974,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = zext <4 x i16> %load to <4 x i64> @@ -6186,8 +6122,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = sext <4 x i16> %load to <4 x i64> @@ -6397,8 +6331,6 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(4) %in %ext = zext <8 x i16> %load to <8 x i64> @@ -6645,8 +6577,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(4) %in %ext = sext <8 x i16> %load to <8 x i64> 
@@ -7013,8 +6943,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = zext <16 x i16> %load to <16 x i64> @@ -7462,8 +7390,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = sext <16 x i16> %load to <16 x i64> @@ -8149,8 +8075,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(4) %in %ext = zext <32 x i16> %load to <32 x i64> @@ -9005,8 +8929,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(4) %in %ext = sext <32 x i16> %load to <32 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 4ab55164e099..72d19a147568 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -79,8 +79,6 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load i32, ptr addrspace(4) %in @@ -165,8 +163,6 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <2 x i32>, ptr addrspace(4) %in @@ -262,8 +258,6 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i32>, ptr addrspace(4) %in @@ -357,8 +351,6 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <4 x i32>, ptr addrspace(4) %in @@ -489,8 +481,6 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <8 x i32>, ptr addrspace(4) %in @@ -651,8 +641,6 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: global_store_b32 v8, v9, s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <9 x i32>, ptr addrspace(4) %in @@ -819,8 +807,6 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: global_store_b64 v10, v[8:9], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16 ; GFX12-NEXT: global_store_b128 v10, v[4:7], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <10 x i32>, ptr addrspace(4) %in @@ -998,8 +984,6 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v11, v[0:3], s[8:9] offset:16 ; GFX12-NEXT: global_store_b128 v11, v[4:7], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <11 x i32>, ptr addrspace(4) %in @@ -1175,8 +1159,6 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: global_store_b128 v12, v[0:3], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16 ; GFX12-NEXT: global_store_b128 v12, v[8:11], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <12 x i32>, ptr addrspace(4) %in @@ -1383,8 +1365,6 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i32>, ptr addrspace(4) %in @@ -1468,8 +1448,6 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load i32, ptr addrspace(4) %in %ext = zext i32 %ld to i64 @@ -1561,8 +1539,6 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load i32, ptr addrspace(4) %in %ext = sext i32 %ld to i64 @@ -1646,8 +1622,6 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <1 x i32>, ptr addrspace(4) %in %ext = zext <1 x i32> %ld to <1 x i64> @@ -1739,8 +1713,6 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <1 x i32>, ptr addrspace(4) %in %ext = sext <1 x i32> %ld to <1 x i64> @@ -1837,8 +1809,6 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <2 x i32>, ptr addrspace(4) %in %ext = zext <2 x i32> %ld to <2 x i64> @@ -1951,8 +1921,6 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <2 x i32>, ptr addrspace(4) %in %ext = sext <2 x i32> %ld to <2 x i64> @@ -2081,8 +2049,6 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <4 x i32>, ptr addrspace(4) %in %ext = zext <4 x i32> %ld to <4 x i64> @@ -2242,8 +2208,6 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <4 x i32>, ptr addrspace(4) %in %ext = sext <4 x i32> %ld to <4 x i64> @@ -2441,8 +2405,6 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <8 x i32>, ptr addrspace(4) %in %ext = zext <8 x i32> %ld to <8 x i64> @@ -2709,8 +2671,6 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <8 x i32>, ptr addrspace(4) %in %ext = sext <8 x i32> %ld to <8 x i64> @@ -3189,8 +3149,6 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v28, v[20:23], s[16:17] offset:32 ; GFX12-NEXT: global_store_b128 v28, v[24:27], s[16:17] offset:16 ; GFX12-NEXT: 
global_store_b128 v28, v[0:3], s[16:17] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <16 x i32>, ptr addrspace(4) %in %ext = sext <16 x i32> %ld to <16 x i64> @@ -3532,8 +3490,6 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <16 x i32>, ptr addrspace(4) %in %ext = zext <16 x i32> %ld to <16 x i64> @@ -4448,8 +4404,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <32 x i32>, ptr addrspace(4) %in %ext = sext <32 x i32> %ld to <32 x i64> @@ -5087,8 +5041,6 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <32 x i32>, ptr addrspace(4) %in %ext = zext <32 x i32> %ld to <32 x i64> @@ -5459,8 +5411,6 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32 ; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16 ; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load <32 x i32>, ptr addrspace(4) %in store <32 x i32> %ld, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 46c7c2b08cd6..7fd5909f3b2b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -70,8 +70,6 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load i64, ptr addrspace(4) %in store i64 %ld, ptr addrspace(1) %out @@ -150,8 +148,6 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <2 x i64>, ptr addrspace(4) %in @@ -266,8 +262,6 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i64>, ptr addrspace(4) %in @@ -384,8 +378,6 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, 
ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <4 x i64>, ptr addrspace(4) %in @@ -576,8 +568,6 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <8 x i64>, ptr addrspace(4) %in @@ -931,8 +921,6 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32 ; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16 ; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i64>, ptr addrspace(4) %in diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index ce17c81a24dd..f14d4afbee9d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -84,8 +84,6 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load i8, ptr addrspace(4) %in @@ -171,8 +169,6 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <2 x i8>, ptr addrspace(4) %in @@ -286,8 +282,6 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[0:1] offset:2 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i8>, ptr addrspace(4) %in @@ -356,8 +350,6 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <4 x i8>, ptr addrspace(4) %in @@ -430,8 +422,6 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <8 x i8>, ptr addrspace(4) %in @@ -511,8 +501,6 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: 
global_store_b128 v4, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <16 x i8>, ptr addrspace(4) %in @@ -588,8 +576,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %in %ext = zext i8 %a to i32 @@ -666,8 +652,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %ld = load i8, ptr addrspace(4) %in %ext = sext i8 %ld to i32 @@ -743,8 +727,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <1 x i8>, ptr addrspace(4) %in %ext = zext <1 x i8> %load to <1 x i32> @@ -821,8 +803,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <1 x i8>, ptr addrspace(4) %in %ext = sext <1 x i8> %load to <1 x i32> @@ -922,8 +902,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %in %ext = zext <2 x i8> %load to <2 x i32> @@ -1021,8 +999,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %in %ext = sext <2 x i8> %load to <2 x i32> @@ -1123,8 +1099,6 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i8>, ptr addrspace(4) %in @@ -1226,8 +1200,6 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm entry: %ld = load <3 x i8>, ptr addrspace(4) %in @@ -1333,8 +1305,6 @@ define amdgpu_kernel 
void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT: v_mov_b32_e32 v2, s2
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <4 x i8>, ptr addrspace(4) %in
 %ext = zext <4 x i8> %load to <4 x i32>
@@ -1442,8 +1412,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT: v_mov_b32_e32 v2, s4
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <4 x i8>, ptr addrspace(4) %in
 %ext = sext <4 x i8> %load to <4 x i32>
@@ -1603,8 +1571,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT: s_clause 0x1
 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <8 x i8>, ptr addrspace(4) %in
 %ext = zext <8 x i8> %load to <8 x i32>
@@ -1767,8 +1733,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT: s_clause 0x1
 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <8 x i8>, ptr addrspace(4) %in
 %ext = sext <8 x i8> %load to <8 x i32>
@@ -2035,8 +1999,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <16 x i8>, ptr addrspace(4) %in
 %ext = zext <16 x i8> %load to <16 x i32>
@@ -2311,8 +2273,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <16 x i8>, ptr addrspace(4) %in
 %ext = sext <16 x i8> %load to <16 x i32>
@@ -2795,8 +2755,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <32 x i8>, ptr addrspace(4) %in
 %ext = zext <32 x i8> %load to <32 x i32>
@@ -3302,8 +3260,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <32 x i8>, ptr addrspace(4) %in
 %ext = sext <32 x i8> %load to <32 x i32>
@@ -4221,8 +4177,6 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32
 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16
 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <64 x i8>, ptr addrspace(4) %in
 %ext = zext <64 x i8> %load to <64 x i32>
@@ -5180,8 +5134,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32
 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16
 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <64 x i8>, ptr addrspace(4) %in
 %ext = sext <64 x i8> %load to <64 x i32>
@@ -5263,8 +5215,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %a = load i8, ptr addrspace(4) %in
 %ext = zext i8 %a to i64
@@ -5351,8 +5301,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %a = load i8, ptr addrspace(4) %in
 %ext = sext i8 %a to i64
@@ -5432,8 +5380,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <1 x i8>, ptr addrspace(4) %in
 %ext = zext <1 x i8> %load to <1 x i64>
@@ -5520,8 +5466,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <1 x i8>, ptr addrspace(4) %in
 %ext = sext <1 x i8> %load to <1 x i64>
@@ -5628,8 +5572,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 8, v2
 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <2 x i8>, ptr addrspace(4) %in
 %ext = zext <2 x i8> %load to <2 x i64>
@@ -5742,8 +5684,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <2 x i8>, ptr addrspace(4) %in
 %ext = sext <2 x i8> %load to <2 x i64>
@@ -5877,8 +5817,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
 ; GFX12-NEXT: v_mov_b32_e32 v2, s3
 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <4 x i8>, ptr addrspace(4) %in
 %ext = zext <4 x i8> %load to <4 x i64>
@@ -6031,8 +5969,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT: s_clause 0x1
 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <4 x i8>, ptr addrspace(4) %in
 %ext = sext <4 x i8> %load to <4 x i64>
@@ -6245,8 +6181,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
 ; GFX12-NEXT: v_mov_b32_e32 v2, s3
 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <8 x i8>, ptr addrspace(4) %in
 %ext = zext <8 x i8> %load to <8 x i64>
@@ -6502,8 +6436,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <8 x i8>, ptr addrspace(4) %in
 %ext = sext <8 x i8> %load to <8 x i64>
@@ -6875,8 +6807,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT: v_mov_b32_e32 v0, s3
 ; GFX12-NEXT: v_mov_b32_e32 v2, s2
 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <16 x i8>, ptr addrspace(4) %in
 %ext = zext <16 x i8> %load to <16 x i64>
@@ -7339,8 +7269,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:32
 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <16 x i8>, ptr addrspace(4) %in
 %ext = sext <16 x i8> %load to <16 x i64>
@@ -8034,8 +7962,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
 ; GFX12-NEXT: v_mov_b32_e32 v2, s1
 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <32 x i8>, ptr addrspace(4) %in
 %ext = zext <32 x i8> %load to <32 x i64>
@@ -8933,8 +8859,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:32
 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:16
 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <32 x i8>, ptr addrspace(4) %in
 %ext = sext <32 x i8> %load to <32 x i64>
@@ -9035,8 +8959,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %a = load i8, ptr addrspace(4) %in
 %ext = zext i8 %a to i16
@@ -9123,8 +9045,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %a = load i8, ptr addrspace(4) %in
 %ext = sext i8 %a to i16
@@ -9209,8 +9129,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <1 x i8>, ptr addrspace(4) %in
 %ext = zext <1 x i8> %load to <1 x i16>
@@ -9297,8 +9215,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <1 x i8>, ptr addrspace(4) %in
 %ext = sext <1 x i8> %load to <1 x i16>
@@ -9396,8 +9312,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <2 x i8>, ptr addrspace(4) %in
 %ext = zext <2 x i8> %load to <2 x i16>
@@ -9512,8 +9426,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <2 x i8>, ptr addrspace(4) %in
 %ext = sext <2 x i8> %load to <2 x i16>
@@ -9645,8 +9557,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT: v_mov_b32_e32 v0, s3
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <4 x i8>, ptr addrspace(4) %in
 %ext = zext <4 x i8> %load to <4 x i16>
@@ -9797,8 +9707,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <4 x i8>, ptr addrspace(4) %in
 %ext = sext <4 x i8> %load to <4 x i16>
@@ -9995,8 +9903,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT: v_mov_b32_e32 v2, s3
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <8 x i8>, ptr addrspace(4) %in
 %ext = zext <8 x i8> %load to <8 x i16>
@@ -10231,8 +10137,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
 ; GFX12-NEXT: v_mov_b32_e32 v2, s3
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <8 x i8>, ptr addrspace(4) %in
 %ext = sext <8 x i8> %load to <8 x i16>
@@ -10576,8 +10480,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX12-NEXT: s_clause 0x1
 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <16 x i8>, ptr addrspace(4) %in
 %ext = zext <16 x i8> %load to <16 x i16>
@@ -11001,8 +10903,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX12-NEXT: s_clause 0x1
 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <16 x i8>, ptr addrspace(4) %in
 %ext = sext <16 x i8> %load to <16 x i16>
@@ -11638,8 +11538,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:32
 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:16
 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <32 x i8>, ptr addrspace(4) %in
 %ext = zext <32 x i8> %load to <32 x i16>
@@ -12439,8 +12337,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:32
 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:16
 ; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %load = load <32 x i8>, ptr addrspace(4) %in
 %ext = sext <32 x i8> %load to <32 x i16>
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 08b089a32d1d..22bb01ba2be1 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7103,8 +7103,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; GFX940-LABEL: local_ds_fadd:
@@ -7279,8 +7277,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX10-LABEL: local_ds_fadd:
@@ -7971,8 +7967,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; GFX940-LABEL: local_ds_fadd_one_as:
@@ -8138,8 +8132,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0
 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX10-LABEL: local_ds_fadd_one_as:
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index df954f6f940c..f2dcd151e5b5 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -71,8 +71,6 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp
 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2
 ; GCN-NEXT: .LBB1_3: ; %for.end
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT: s_endpgm
 entry:
 %cmp6.not = icmp eq i32 %n, 0
@@ -118,8 +116,6 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
 ; GCN-NEXT: s_cbranch_scc1 .LBB2_2
 ; GCN-NEXT: .LBB2_3: ; %for.end
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT: s_endpgm
 entry:
 %cmp6.not = icmp eq i32 %n, 0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index acba2841a710..79dfabe3b545 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @workgroup_ids_kernel() {
 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1
 ; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: workgroup_ids_kernel:
@@ -66,8 +64,6 @@ define amdgpu_kernel void @workgroup_ids_kernel() {
 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
 ; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 .entry:
 %idx = call i32 @llvm.amdgcn.workgroup.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 1da05ed264a6..356439bad8f0 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -12,6 +12,16 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 ; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; GFX9-NEXT: s_endpgm
 ;
+; GFX9-SDAG-LABEL: _amdgpu_cs_main:
+; GFX9-SDAG: ; %bb.0: ; %.entry
+; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: _amdgpu_cs_main:
+; GFX9-GISEL: ; %bb.0: ; %.entry
+; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
 ; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main:
 ; GFX9ARCH-SDAG: ; %bb.0: ; %.entry
 ; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16
@@ -40,8 +50,6 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1
 ; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT: s_endpgm
 ;
 ; GFX12-GISEL-LABEL: _amdgpu_cs_main:
@@ -52,8 +60,6 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
 ; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT: s_endpgm
 .entry:
 %idx = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -67,6 +73,38 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 }
 define amdgpu_cs void @caller() {
+; GFX9-SDAG-LABEL: caller:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9]
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s0
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi
+; GFX9-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo
+; GFX9-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_add_u32 s8, s8, s0
+; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9]
+; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: caller:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9]
+; GFX9-GISEL-NEXT: s_mov_b32 s8, s0
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo
+; GFX9-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi
+; GFX9-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_add_u32 s8, s8, s0
+; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9]
+; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-GISEL-NEXT: s_endpgm
+;
 ; GFX9ARCH-SDAG-LABEL: caller:
 ; GFX9ARCH-SDAG: ; %bb.0:
 ; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[8:9]
@@ -129,20 +167,20 @@ declare amdgpu_gfx void @callee(i32)
 define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
 ; GFX9-SDAG-LABEL: workgroup_ids_gfx:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_store_dword v[2:3], v0, off
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_store_dword v[4:5], v0, off
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: workgroup_ids_gfx:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9ARCH-SDAG-LABEL: workgroup_ids_gfx:
 ; GFX9ARCH-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a4bde5c9d821..0abe0f91dc0b 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -68,8 +68,6 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %result = lshr <2 x i16> %lhs, %rhs
 store <2 x i16> %result, ptr addrspace(1) %out
@@ -151,8 +149,6 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, v1, v0
 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
@@ -250,8 +246,6 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s0, v1
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
@@ -347,8 +341,6 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, s0
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
@@ -434,8 +426,6 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
@@ -517,8 +507,6 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
@@ -618,8 +606,6 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v3, v1
 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, v2, v0
 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
@@ -711,8 +697,6 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index a3e7bf2caf77..48dc95312f59 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -80,8 +80,6 @@ define amdgpu_kernel void @mad_u16(
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mad_u16 v0, v1, v2, v0
 ; GFX11-NEXT: global_store_b16 v3, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ptr addrspace(1) %r,
 ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 8eb0a46cc8b1..85b4fd0602f1 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -968,8 +968,6 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: mad_i64_i32_uniform:
@@ -988,8 +986,6 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 %ext0 = zext i32 %arg0 to i64
 %ext1 = zext i32 %arg1 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index f2815915b842..5bcdf4d0aaf1 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -99,8 +99,6 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_f32:
@@ -147,8 +145,6 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -283,8 +279,6 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc
 ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_2_use_f32:
@@ -346,8 +340,6 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc
 ; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -440,8 +432,6 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1
 ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
@@ -478,8 +468,6 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -582,8 +570,6 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 4.0, v1
 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_inline_imm_f32:
@@ -630,8 +616,6 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -726,8 +710,6 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr
 ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s0, v1
 ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: s_v_madak_f32:
@@ -770,8 +752,6 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x41200000
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -869,8 +849,6 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: v_s_madak_f32:
@@ -917,8 +895,6 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -988,8 +964,6 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0
 ; GFX11-MAD-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: s_s_madak_f32:
@@ -1021,8 +995,6 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000
 ; GFX11-FMA-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %mul = fmul float %a, %b
 %madak = fadd float %mul, 10.0
@@ -1120,8 +1092,6 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: no_madak_src0_modifier_f32:
@@ -1169,8 +1139,6 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -1278,8 +1246,6 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: no_madak_src1_modifier_f32:
@@ -1327,8 +1293,6 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000
 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -1464,8 +1428,6 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
 ; GFX11-MAD-NEXT: v_mul_f32_e32 v0, v1, v0
 ; GFX11-MAD-NEXT: global_store_b32 v[0:1], v0, off dlc
 ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-MAD-NEXT: s_nop 0
-; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT: s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_constant_bus_violation:
@@ -1533,8 +1495,6 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
 ; GFX11-FMA-NEXT: v_mul_f32_e32 v0, v1, v0
 ; GFX11-FMA-NEXT: global_store_b32 v[0:1], v0, off dlc
 ; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FMA-NEXT: s_nop 0
-; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT: s_endpgm
 bb:
 %tmp = icmp eq i32 %arg1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 92536c207851..6910004a9ef5 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -84,8 +84,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX11-NEXT: v_not_b32_e32 v1, v1
 ; GFX11-NEXT: v_not_b32_e32 v0, v0
 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 entry:
 %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
index e929da796de6..9b413f95abae 100644
--- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
@@ -170,8 +170,6 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1024
 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1040
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: long_store_chain:
@@ -255,8 +253,6 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
 ; GFX12-NEXT: s_clause 0x1
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1024
 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1040
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 store <4 x i32> zeroinitializer, ptr addrspace(1) %p
 %ptr1 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ef2698c1f7..ff39439a2db1 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -116,8 +116,6 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
@@ -194,8 +192,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle i32 %a, %b
 %val = select i1 %cmp, i32 %a, i32 %b
@@ -266,8 +262,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle <1 x i32> %a, %b
 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
@@ -377,8 +371,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ; GFX11-NEXT: v_mov_b32_e32 v2, s3
 ; GFX11-NEXT: v_mov_b32_e32 v3, s2
 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle <4 x i32> %a, %b
 %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
@@ -490,8 +482,6 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX11-NEXT: s_min_i32 s2, s2, s3
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle i8 %a, %b
 %val = select i1 %cmp, i8 %a, i8 %b
@@ -725,8 +715,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle <4 x i8> %a, %b
 %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -830,8 +818,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle <2 x i16> %a, %b
 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
@@ -1013,8 +999,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; GFX11-NEXT: v_pk_min_i16 v1, s5, s7
 ; GFX11-NEXT: v_pk_min_i16 v0, s4, s6
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle <4 x i16> %a, %b
 %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -1132,8 +1116,6 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
@@ -1273,8 +1255,6 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_min_i16 v1, v1, v2
 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
@@ -1352,8 +1332,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp slt i32 %a, %b
 %val = select i1 %cmp, i32 %a, i32 %b
@@ -1441,8 +1419,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp slt <2 x i32> %a, %b
 %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
@@ -1520,8 +1496,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp slt i32 %a, 8
 %val = select i1 %cmp, i32 %a, i32 8
@@ -1599,8 +1573,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle i32 %a, 8
 %val = select i1 %cmp, i32 %a, i32 8
@@ -1718,8 +1690,6 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
@@ -1859,8 +1829,6 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v4
 ; GFX11-NEXT: v_min_u32_e32 v0, v0, v3
 ; GFX11-NEXT: global_store_b96 v6, v[0:2], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid
@@ -2039,8 +2007,6 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b16 v4, v1, s[4:5] offset:4
 ; GFX11-NEXT: global_store_b32 v4, v0, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
@@ -2118,8 +2084,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp ule i32 %a, %b
 %val = select i1 %cmp, i32 %a, i32 %b
@@ -2237,8 +2201,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
@@ -2363,8 +2325,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_min_u16 v1, v1, v2
 ; GFX11-NEXT: global_store_b8 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
@@ -2442,8 +2402,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp ult i32 %a, %b
 %val = select i1 %cmp, i32 %a, i32 %b
@@ -2581,8 +2539,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
 ; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a = load i32, ptr addrspace(1) %aptr, align 4
 %b = load i32, ptr addrspace(1) %bptr, align 4
@@ -2725,8 +2681,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a = load i16, ptr addrspace(1) %aptr, align 2
 %b = load i16, ptr addrspace(1) %bptr, align 2
@@ -2800,8 +2754,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp ult <1 x i32> %a, %b
 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
@@ -2974,8 +2926,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ; GFX11-NEXT: s_clause 0x1
 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp ult <8 x i32> %a, %b
 %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
@@ -3258,8 +3208,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ; GFX11-NEXT: v_pk_min_u16 v1, s5, s9
 ; GFX11-NEXT: v_pk_min_u16 v0, s4, s8
 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp ult <8 x i16> %a, %b
 %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
@@ -3364,8 +3312,6 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; GFX11-NEXT: s_min_u32 s2, s2, s3
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a.ext = zext i16 %a to i32
 %b.ext = zext i16 %b to i32
@@ -3473,8 +3419,6 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; GFX11-NEXT: s_min_i32 s2, s2, s3
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %a.ext = sext i16 %a to i32
 %b.ext = sext i16 %b to i32
@@ -3585,8 +3529,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX11-NEXT: s_min_i32 s2, s2, s3
 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %cmp = icmp sle i16 %a, %b
 %val = select i1 %cmp, i16 %a, i16 %b
@@ -3697,8 +3639,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tmp = icmp ult i64 %a, %b
 %val = select i1 %tmp, i64 %a, i64 %b
@@ -3807,8 +3747,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tmp = icmp ule i64 %a, %b
 %val = select i1 %tmp, i64 %a, i64 %b
@@ -3917,8 +3855,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tmp = icmp slt i64 %a, %b
 %val = select i1 %tmp, i64 %a, i64 %b
@@ -4027,8 +3963,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tmp = icmp sle i64 %a, %b
 %val = select i1 %tmp, i64 %a, i64 %b
@@ -4174,8 +4108,6 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_min_i16 v1, v1, v2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
@@ -4325,8 +4257,6 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_pk_min_u16 v1, v1, v2
 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
index 062955d72206..c375b16ee380 100644
--- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
@@ -22,8 +22,6 @@ define amdgpu_ps void @s_test_minmax_f32(float inreg %a, float inreg %b, float i
 ; SDAG-NEXT: s_minimum_f32 s0, s0, s2
 ; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT: s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_f32:
@@ -36,8 +34,6 @@ define amdgpu_ps void @s_test_minmax_f32(float inreg %a, float inreg %b, float i
 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
 ; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT: s_endpgm
 %smax = call float @llvm.maximum.f32(float %a, float %b)
 %sminmax = call float @llvm.minimum.f32(float %smax, float %c)
@@ -115,8 +111,6 @@ define amdgpu_ps void @s_test_minmax_f16(half inreg %a, half inreg %b, half inre
 ; SDAG-NEXT: s_minimum_f16 s0, s0, s2
 ; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; SDAG-NEXT: global_store_b16 v0, v1, s[4:5]
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT: s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_f16:
@@ -129,8 +123,6 @@ define amdgpu_ps void @s_test_minmax_f16(half inreg %a, half inreg %b, half inre
 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
 ; GISEL-NEXT: global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT: s_endpgm
 %smax = call half @llvm.maximum.f16(half %a, half %b)
 %sminmax = call half @llvm.minimum.f16(half %smax, half %c)
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index b5b9997f297c..73f3d4c037ad 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -22,8 +22,6 @@ define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; SDAG-NEXT: s_mov_b32 s4, s3
 ; SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT: s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_i32:
@@ -34,8 +32,6 @@ define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; GISEL-NEXT: s_mov_b32 s7, s4
 ; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
 ; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT: s_endpgm
 %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
 %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
@@ -111,8 +107,6 @@ define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; SDAG-NEXT: s_mov_b32 s4, s3
 ; SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT: s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_u32:
@@ -123,8 +117,6 @@ define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; GISEL-NEXT: s_mov_b32 s7, s4
 ; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
 ; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT: s_endpgm
 %smax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
 %sminmax = call i32 @llvm.umin.i32(i32 %smax, i32 %c)
@@ -209,8 +201,6 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg
 ; SDAG-NEXT: s_mov_b32 s4, s3
 ; SDAG-NEXT: v_maxmin_f32 v0, s0, s1, v0
 ; SDAG-NEXT: global_store_b32 v1, v0, s[4:5]
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT: s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_f32_ieee_false:
@@ -220,8 +210,6 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg
 ; GISEL-NEXT: s_mov_b32 s7, s4
 ; GISEL-NEXT: v_maxmin_f32 v0, s0, s1, v0
 ; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT: s_endpgm
 %smax = call float @llvm.maxnum.f32(float %a, float %b)
 %sminmax = call float @llvm.minnum.f32(float %smax, float %c)
@@ -303,8 +291,6 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ; SDAG-NEXT: s_mov_b32 s4, s3
 ; SDAG-NEXT: v_maxmin_f16 v0, s0, s1, v0
 ; SDAG-NEXT: global_store_b16 v1, v0, s[4:5]
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT: s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_f16_ieee_false:
@@ -314,8 +300,6 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ; GISEL-NEXT: s_mov_b32 s7, s4
 ; GISEL-NEXT: v_maxmin_f16 v0, s0, s1, v0
 ; GISEL-NEXT: global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT: s_endpgm
 %smax = call half @llvm.maxnum.f16(half %a, half %b)
 %sminmax = call half @llvm.minnum.f16(half %smax, half %c)
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 30a40e6af853..03de142a41b4 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -103,8 +103,6 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3
 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: test_mul_v2i32:
@@ -124,8 +122,6 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: test_mul_v2i32:
@@ -265,8 +261,6 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5
 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4
 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_mul_v4i32:
@@ -290,8 +284,6 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5
 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4
 ; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: v_mul_v4i32:
@@ -394,8 +386,6 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
 ; GFX11-NEXT: s_mov_b32 s6, -1
 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: s_trunc_i64_mul_to_i32:
@@ -409,8 +399,6 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
 ; GFX12-NEXT: s_mov_b32 s6, -1
 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
 ; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_trunc_i64_mul_to_i32:
@@ -541,8 +529,6 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_trunc_i64_mul_to_i32:
@@ -566,8 +552,6 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0
 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: v_trunc_i64_mul_to_i32:
@@ -670,8 +654,6 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: mul64_sext_c:
@@ -685,8 +667,6 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT: s_mov_b32 s2, -1
 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: mul64_sext_c:
@@ -778,8 +758,6 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: mul64_zext_c:
@@ -792,8 +770,6 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT: s_mov_b32 s2, -1
 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: mul64_zext_c:
@@ -911,8 +887,6 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0
 ; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_mul64_sext_c:
@@ -932,8 +906,6 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0
 ; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: v_mul64_sext_c:
@@ -1058,8 +1030,6 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0
 ; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_mul64_zext_c:
@@ -1079,8 +1049,6 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0
 ; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: v_mul64_zext_c:
@@ -1202,8 +1170,6 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9
 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_mul64_sext_inline_imm:
@@ -1223,8 +1189,6 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0
 ; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0
 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: v_mul64_sext_inline_imm:
@@ -1320,8 +1284,6 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: s_mul_i32:
@@ -1337,8 +1299,6 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
 ; GFX12-NEXT: s_mov_b32 s2, -1
 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_mul_i32:
@@ -1446,8 +1406,6 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_mul_i32:
@@ -1466,8 +1424,6 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
 ; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: v_mul_i32:
@@ -1567,8 +1523,6 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
 ; GFX11-NEXT: s_mov_b32 s2, -1
 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: s_mul_i1:
@@ -1585,8 +1539,6 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
 ; GFX12-NEXT: s_mov_b32 s2, -1
 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_mul_i1:
@@ -1724,8 +1676,6 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_mul_i1:
@@ -1748,8 +1698,6 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: v_mul_i1:
@@ -1894,8 +1842,6 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
 ; GFX11-NEXT: s_mov_b32 s0, s4
 ; GFX11-NEXT: s_mov_b32 s1, s5
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: s_mul_i64:
@@ -1909,8 +1855,6 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: s_mov_b32 s6, -1
 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT: s_endpgm
 ;
 ; EG-LABEL: s_mul_i64:
@@ -2070,8 +2014,6 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1
 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: v_mul_i64:
@@ -2101,8 +2043,6 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
 ; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1
 ;
GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; EG-LABEL: v_mul_i64: @@ -2310,8 +2250,6 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: mul32_in_branch: @@ -2347,8 +2285,6 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; EG-LABEL: mul32_in_branch: @@ -2574,8 +2510,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: mul64_in_branch: @@ -2604,8 +2538,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; EG-LABEL: mul64_in_branch: @@ -2909,8 +2841,6 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_i128: @@ -2954,8 +2884,6 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; EG-LABEL: s_mul_i128: @@ -3202,8 +3130,6 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13 ; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo ; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_mul_i128: @@ -3243,8 +3169,6 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 ; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; EG-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index 548c196df1da..3a9cf9678d84 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -1884,8 +1884,6 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_1: @@ -1896,8 +1894,6 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 1 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -1934,8 +1930,6 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_11bit_max: @@ -1946,8 +1940,6 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -1984,8 +1976,6 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_12bit_max: @@ -1996,8 +1986,6 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2034,8 +2022,6 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_13bit_max: @@ -2046,8 +2032,6 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2084,8 +2068,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max: @@ -2096,8 +2078,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2137,8 +2117,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max: @@ -2149,8 +2127,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max: @@ -2206,8 +2182,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max: @@ -2218,8 +2192,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: @@ -2243,8 +2215,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2281,8 +2251,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max: @@ -2293,8 +2261,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095 %load = load volatile i8, ptr addrspace(1) 
%gep, align 1 @@ -2331,8 +2297,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max: @@ -2343,8 +2307,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2381,8 +2343,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max: @@ -2393,8 +2353,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2434,8 +2392,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max: @@ -2446,8 +2402,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max: @@ -2503,8 +2457,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max: @@ -2515,8 +2467,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: @@ -2540,8 +2490,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; GFX11-SDAG-NEXT: global_load_u8 v0, 
v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2586,8 +2534,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max: @@ -2598,8 +2544,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: @@ -2623,8 +2567,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2670,8 +2612,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: @@ -2685,8 +2625,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: @@ -2710,8 +2648,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: @@ -2724,8 +2660,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2771,8 +2705,6 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: @@ -2786,8 +2718,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: @@ -2811,8 +2741,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: @@ -2825,8 +2753,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -2872,8 +2798,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: @@ -2887,8 +2811,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: @@ -2912,8 +2834,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: @@ -2926,8 +2846,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687 %load = load 
volatile i8, ptr addrspace(1) %gep, align 1 @@ -2973,8 +2891,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: @@ -2988,8 +2904,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: @@ -3013,8 +2927,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: @@ -3027,8 +2939,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -3074,8 +2984,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: @@ -3089,8 +2997,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: @@ -3114,8 +3020,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: @@ -3128,8 +3032,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: 
s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -3175,8 +3077,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: @@ -3190,8 +3090,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: @@ -3215,8 +3113,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: @@ -3229,8 +3125,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -3274,8 +3168,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: @@ -3288,8 +3180,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: @@ -3303,8 +3193,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -3348,8 +3236,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: @@ -3362,8 +3248,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: @@ -3377,8 +3261,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -3422,8 +3304,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: @@ -3436,8 +3316,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: @@ -3451,8 +3329,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -3496,8 +3372,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: @@ -3510,8 +3384,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: @@ -3525,8 +3397,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 
v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -3570,8 +3440,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: @@ -3584,8 +3452,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: @@ -3599,8 +3465,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617 %load = load volatile i8, ptr addrspace(1) %gep, align 1 @@ -3644,8 +3508,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: @@ -3658,8 +3520,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: @@ -3673,8 +3533,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616 %load = load volatile i8, ptr addrspace(1) %gep, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index df15f98ae27f..dd0b96fbb495 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -53,8 +53,6 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: @@ 
-69,8 +67,6 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -131,8 +127,6 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: @@ -147,8 +141,6 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1] ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid @@ -209,8 +201,6 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz: @@ -225,8 +215,6 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -287,8 +275,6 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz: @@ -303,8 +289,6 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1] ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid @@ -340,8 +324,6 @@ define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 0.5 @@ -373,8 +355,6 @@ define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; 
GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_div2_f64_signed_zeros: @@ -383,8 +363,6 @@ define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %add = fadd double %a, 1.0 %div2 = fmul double %add, 0.5 @@ -411,8 +389,6 @@ define amdgpu_ps void @v_omod_div2_f32(float %a) #0 { ; GFX11PLUS: ; %bb.0: ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 0.5 @@ -439,16 +415,12 @@ define amdgpu_ps void @v_omod_div2_f64(double %a) #5 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_div2_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], 1.0 div:2 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 %div2 = fmul nsz double %add, 0.5 @@ -475,8 +447,6 @@ define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 { ; GFX11PLUS: ; %bb.0: ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 2.0 @@ -503,16 +473,12 @@ define amdgpu_ps void @v_omod_mul2_med3(float %x, float %y, float %z) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_med3_f32 v0, v0, v1, v2 mul:2 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_mul2_med3: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_med3_num_f32 v0, v0, v1, v2 mul:2 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z) %div2 = fmul float %fmed3, 2.0 @@ -539,16 +505,12 @@ define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_mul2_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], 1.0 mul:2 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 %div2 = fmul nsz double %add, 2.0 @@ -575,8 +537,6 @@ define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 { ; GFX11PLUS: ; %bb.0: ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 4.0 @@ -603,16 +563,12 @@ define amdgpu_ps void 
@v_omod_mul4_f64(double %a) #5 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_mul4_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], 1.0 mul:4 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 %div2 = fmul nsz double %add, 4.0 @@ -650,8 +606,6 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 { ; GFX11-NEXT: global_store_b32 v[0:1], v1, off ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_mul4_multi_use_f32: @@ -663,8 +617,6 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 { ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 4.0 @@ -692,8 +644,6 @@ define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 { ; GFX11PLUS: ; %bb.0: ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10 @@ -722,8 +672,6 @@ define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 { ; GFX11PLUS: ; %bb.0: ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 0.5 @@ -758,8 +706,6 @@ define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %max = call float @llvm.maxnum.f32(float %add, float 0.0) @@ -792,8 +738,6 @@ define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_mul_f32_e64 v0, |v0|, 0.5 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %abs.add = call float @llvm.fabs.f32(float %add) @@ -821,8 +765,6 @@ define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 { ; GFX11PLUS: ; %bb.0: ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, v0 clamp ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, %a %max = call float @llvm.maxnum.f32(float %add, float 0.0) @@ -854,8 +796,6 @@ define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off 
-; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_add_clamp_self_f32: @@ -864,8 +804,6 @@ define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %max = call float @llvm.maxnum.f32(float %a, float 0.0) %clamp = call float @llvm.minnum.f32(float %max, float 1.0) @@ -897,8 +835,6 @@ define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_add_f32_e64 v0, |v0|, |v0| ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %x = fadd float %a, 1.0 %abs.x = call float @llvm.fabs.f32(float %x) @@ -930,8 +866,6 @@ define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_add_f32_e64 v0, |v0|, v0 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %x = fadd float %a, 1.0 %abs.x = call float @llvm.fabs.f32(float %x) @@ -963,8 +897,6 @@ define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, |v0| ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %x = fadd float %a, 1.0 %abs.x = call float @llvm.fabs.f32(float %x) @@ -997,8 +929,6 @@ define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2.0 = fmul float %add, 0.5 @@ -1031,8 +961,6 @@ define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %div2 = fmul float %add, 0.5 @@ -1064,8 +992,6 @@ define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_div2_f64_denormals: @@ -1074,8 +1000,6 @@ define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %add = fadd double %a, 1.0 %div2 = fmul double %add, 0.5 @@ -1107,8 +1031,6 @@ define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX11PLUS-NEXT: global_store_b32 
v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd float %a, 1.0 %mul2 = fadd float %add, %add @@ -1140,8 +1062,6 @@ define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_omod_mul2_f64_denormals: @@ -1150,8 +1070,6 @@ define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %add = fadd double %a, 1.0 %mul2 = fadd double %add, %add @@ -1185,8 +1103,6 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_mul_f16_e32 v0, 0.5, v0 ; GFX11PLUS-NEXT: global_store_b16 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd half %a, 1.0 %div2 = fmul half %add, 0.5 @@ -1220,8 +1136,6 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 { ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11PLUS-NEXT: v_add_f16_e32 v0, v0, v0 ; GFX11PLUS-NEXT: global_store_b16 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd half %a, 1.0 %mul2 = fadd half %add, %add @@ -1251,8 +1165,6 @@ define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 { ; GFX11PLUS: ; %bb.0: ; GFX11PLUS-NEXT: v_add_f16_e64 v0, v0, 1.0 div:2 ; GFX11PLUS-NEXT: global_store_b16 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %add = fadd half %a, 1.0 %div2 = fmul half %add, 0.5 @@ -1284,8 +1196,6 @@ define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 { ; GFX11PLUS-NEXT: v_add_f32_e64 v1, v1, v0 mul:2 ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off -; GFX11PLUS-NEXT: s_nop 0 -; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11PLUS-NEXT: s_endpgm %mul = fmul float %a, %a %add = fadd float %mul, %b diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir index e50ee11fef51..80cb39830f1a 100644 --- a/llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir +++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir @@ -30,11 +30,11 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: test - ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec + ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr97, 0, 4, implicit $exec ; CHECK-NEXT: S_NOP 0, debug-location !8 ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0, debug-location !8 ; CHECK-NEXT: S_ENDPGM 0, debug-location !8 - GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec + GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr97, 0, 4, implicit $exec S_ENDPGM 0, debug-location !8 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 25804123eff1..a9cb169d1a09 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -24,6 +24,7 @@
   define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
   define amdgpu_cs void @with_calls() { ret void }
   define fastcc void @with_tail_calls() { ret void }
+  define amdgpu_cs void @waveslot_limited() { ret void }
 ...
 ---
@@ -36,13 +37,13 @@ body: |
     ; OPT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: tbuffer_store1
     ; NOOPT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -55,13 +56,13 @@ body: |
     ; OPT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: tbuffer_store2
     ; NOOPT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -72,9 +73,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: flat_store
     ; CHECK: FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -88,15 +89,15 @@ body: |
     ; OPT-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: global_store
     ; NOOPT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     ; NOOPT-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 0
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -109,13 +110,13 @@ body: |
     ; OPT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: buffer_store_format
     ; NOOPT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -128,11 +129,11 @@ body: |
     ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
     ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
     ; CHECK-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0 = IMPLICIT_DEF
     renamable $vgpr1 = IMPLICIT_DEF
     DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -150,17 +151,17 @@ body: |
     ; OPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: global_store_dword
     ; NOOPT: liveins: $vgpr0, $sgpr0_sgpr1
     ; NOOPT-NEXT: {{ $}}
     ; NOOPT-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
     ; NOOPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
     GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -185,7 +186,7 @@ body: |
     ; CHECK-NEXT: S_BRANCH %bb.2
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: bb.2:
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.1
@@ -201,7 +202,7 @@ body: |
     S_BRANCH %bb.2
   bb.2:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
@@ -231,7 +232,7 @@ body: |
     ; OPT-NEXT: bb.2:
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: multiple_basic_blocks2
     ; NOOPT: bb.0:
@@ -249,7 +250,7 @@ body: |
     ; NOOPT-NEXT: S_BRANCH %bb.2
     ; NOOPT-NEXT: {{ $}}
     ; NOOPT-NEXT: bb.2:
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.2
@@ -265,7 +266,7 @@ body: |
     S_BRANCH %bb.2
   bb.2:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
@@ -303,7 +304,7 @@ body: |
     ; OPT-NEXT: bb.4:
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: multiple_basic_blocks3
     ; NOOPT: bb.0:
@@ -331,7 +332,7 @@ body: |
     ; NOOPT-NEXT: S_BRANCH %bb.4
     ; NOOPT-NEXT: {{ $}}
     ; NOOPT-NEXT: bb.4:
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.2
@@ -357,7 +358,7 @@ body: |
     S_BRANCH %bb.4
   bb.4:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -380,7 +381,7 @@ body: |
     ; CHECK-NEXT: S_BRANCH %bb.2
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: bb.2:
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.1
@@ -395,7 +396,7 @@ body: |
     S_BRANCH %bb.2
   bb.2:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -422,7 +423,7 @@ body: |
     ; OPT-NEXT: bb.2:
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: recursive_loop_vmem
     ; NOOPT: bb.0:
@@ -441,7 +442,7 @@ body: |
     ; NOOPT-NEXT: S_BRANCH %bb.2
     ; NOOPT-NEXT: {{ $}}
     ; NOOPT-NEXT: bb.2:
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.1
@@ -457,7 +458,7 @@ body: |
     S_BRANCH %bb.2
   bb.2:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -470,13 +471,13 @@ body: |
     ; OPT: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: image_store
     ; NOOPT: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -488,10 +489,10 @@ body: |
     ; CHECK-LABEL: name: scratch_store
     ; CHECK: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
     ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
     SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -504,13 +505,13 @@ body: |
     ; OPT: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: buffer_atomic
     ; NOOPT: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -521,9 +522,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: flat_atomic
    ; CHECK: renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
@@ -535,9 +536,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: global_atomic
     ; CHECK: renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -548,9 +549,9 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: image_atomic
     ; CHECK: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
@@ -562,38 +563,59 @@ body: |
     ; CHECK-LABEL: name: global_store_optnone
     ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     ; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 0
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
+# Make sure we don't send DEALLOC_VGPRS after a call, since there might be
+# scratch stores still in progress.
 name: with_calls
 frameInfo:
   hasCalls: true
 body: |
   bb.0:
-    ; Make sure we don't send DEALLOC_VGPRS after a call, since there might be
-    ; scratch stores still in progress.
     ; CHECK-LABEL: name: with_calls
-    ; CHECK-NOT: S_SENDMSG 3
-    ; CHECK: S_ENDPGM 0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 ---
+# Make sure we don't send DEALLOC_VGPRS when there's a tail call, since the
+# only valid action after DEALLOC_VGPRS is to terminate the wave.
 name: with_tail_calls
 frameInfo:
   hasCalls: true
 body: |
   bb.0:
-    ; Make sure we don't send DEALLOC_VGPRS when there's a tail call, since the
-    ; only valid action after DEALLOC_VGPRS is to terminate the wave.
     ; CHECK-LABEL: name: with_tail_calls
-    ; CHECK-NOT: S_SENDMSG 3
-    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr97, 0, 4, implicit $exec
+    ; CHECK-NEXT: SI_TCRETURN undef renamable $sgpr4_sgpr5, @with_tail_calls, 0, csr_amdgpu
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr97, 0, 4, implicit $exec
     SI_TCRETURN undef renamable $sgpr4_sgpr5, @with_tail_calls, 0, csr_amdgpu
 ...
+
+---
+# Do not deallocate VGPRs if the kernel uses a small enough number of VGPRs,
+# making it likely waveslot limited and not VGPR limited. For gfx11 that
+# is 96 registers.
+name: waveslot_limited
+frameInfo:
+  hasCalls: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: waveslot_limited
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr96, 0, 4, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr96, 0, 4, implicit $exec
+    S_ENDPGM 0
+...
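Taken together, with_calls, with_tail_calls, and waveslot_limited pin down when the dealloc message must be suppressed: after any call (the callee's scratch stores may still be in flight), before a tail call (the only valid action after DEALLOC_VGPRS is terminating the wave), and when the shader touches so few VGPRs that it is likely waveslot limited rather than VGPR limited. The tests draw that last line between a highest used VGPR of v96 (plain S_ENDPGM) and v97 (message sent). Below is a self-contained sketch of that decision -- an illustration of the tested behavior only, not the in-tree code; the function name and the boolean inputs are invented for the example, and the real pass derives them from MachineFrameInfo and register usage instead:

// Returns true when it is worthwhile and safe to send
// sendmsg(MSG_DEALLOC_VGPRS) before s_endpgm.
bool shouldSendDeallocVgprs(bool HasCalls, bool HasTailCalls,
                            unsigned HighestVgprUsed) {
  // Calls may leave scratch stores in flight, and a tail call must keep
  // running after the would-be dealloc point, so both disable the message.
  if (HasCalls || HasTailCalls)
    return false;
  // With only v0..v96 in use, a gfx11 shader is likely waveslot limited,
  // so releasing its VGPRs would not let additional waves launch.
  return HighestVgprUsed > 96;
}

Fed the tests' inputs, the plain store tests (highest VGPR v97, no calls) get the message, while with_calls, with_tail_calls, and waveslot_limited do not.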
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index f692584ef92b..d1e785f8daa0 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -64,8 +64,6 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = shl i32 %x, %y
@@ -151,8 +149,6 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s5, s2
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s4, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = shl <2 x i32> %x, %y
@@ -264,8 +260,6 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s5, s3
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s4, s2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = shl <4 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index a368aa1055b2..2188a9864faa 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -57,8 +57,6 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp0 = sub i32 32, %y
@@ -132,8 +130,6 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s5, s7
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s4, s6
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
@@ -221,8 +217,6 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s5, s9
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s4, s8
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 57c936d4689e..d8deb8109711 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -113,8 +113,6 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
 ; GFX11-NEXT:    v_add_co_u32 v0, s0, s2, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -220,8 +218,6 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ;
GFX11-NEXT: s_endpgm %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %sadd, 0 @@ -336,8 +332,6 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %aptr, align 4 %b = load i32, ptr addrspace(1) %bptr, align 4 @@ -451,8 +445,6 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 @@ -576,8 +568,6 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] ; GFX11-NEXT: global_store_b8 v6, v0, s[6:7] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %aptr, align 4 %b = load i64, ptr addrspace(1) %bptr, align 4 @@ -716,8 +706,6 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1] ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll index dafd23af9d65..ca508eb40017 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll @@ -11,8 +11,6 @@ define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_olt: @@ -24,8 +22,6 @@ define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp olt float %a, %b @@ -43,8 +39,6 @@ define amdgpu_vs void @f32_oeq(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_oeq: @@ -56,8 +50,6 @@ define amdgpu_vs void @f32_oeq(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp oeq float %a, %b @@ -75,8 +67,6 @@ define amdgpu_vs void 
@f32_ole(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_ole: @@ -88,8 +78,6 @@ define amdgpu_vs void @f32_ole(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ole float %a, %b @@ -107,8 +95,6 @@ define amdgpu_vs void @f32_ogt(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_ogt: @@ -120,8 +106,6 @@ define amdgpu_vs void @f32_ogt(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ogt float %a, %b @@ -139,8 +123,6 @@ define amdgpu_vs void @f32_one(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_one: @@ -152,8 +134,6 @@ define amdgpu_vs void @f32_one(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp one float %a, %b @@ -171,8 +151,6 @@ define amdgpu_vs void @f32_oge(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_oge: @@ -184,8 +162,6 @@ define amdgpu_vs void @f32_oge(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp oge float %a, %b @@ -203,8 +179,6 @@ define amdgpu_vs void @f32_ord(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_ord: @@ -216,8 +190,6 @@ define amdgpu_vs void @f32_ord(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ord float %a, %b @@ -235,8 +207,6 @@ define 
amdgpu_vs void @f32_uno(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_uno: @@ -248,8 +218,6 @@ define amdgpu_vs void @f32_uno(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp uno float %a, %b @@ -267,8 +235,6 @@ define amdgpu_vs void @f32_ult(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_ult: @@ -280,8 +246,6 @@ define amdgpu_vs void @f32_ult(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ult float %a, %b @@ -299,8 +263,6 @@ define amdgpu_vs void @f32_ueq(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_ueq: @@ -312,8 +274,6 @@ define amdgpu_vs void @f32_ueq(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ueq float %a, %b @@ -331,8 +291,6 @@ define amdgpu_vs void @f32_ule(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_ule: @@ -344,8 +302,6 @@ define amdgpu_vs void @f32_ule(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ule float %a, %b @@ -363,8 +319,6 @@ define amdgpu_vs void @f32_ugt(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_ugt: @@ -376,8 +330,6 @@ define amdgpu_vs void @f32_ugt(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ugt float %a, %b @@ -395,8 +347,6 
@@ define amdgpu_vs void @f32_une(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_une: @@ -408,8 +358,6 @@ define amdgpu_vs void @f32_une(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp une float %a, %b @@ -427,8 +375,6 @@ define amdgpu_vs void @f32_uge(ptr addrspace(1) inreg %out, float inreg %a, floa ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f32_uge: @@ -440,8 +386,6 @@ define amdgpu_vs void @f32_uge(ptr addrspace(1) inreg %out, float inreg %a, floa ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp uge float %a, %b @@ -459,8 +403,6 @@ define amdgpu_vs void @f16_olt(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_olt: @@ -472,8 +414,6 @@ define amdgpu_vs void @f16_olt(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp olt half %a, %b @@ -491,8 +431,6 @@ define amdgpu_vs void @f16_oeq(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_oeq: @@ -504,8 +442,6 @@ define amdgpu_vs void @f16_oeq(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp oeq half %a, %b @@ -523,8 +459,6 @@ define amdgpu_vs void @f16_ole(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_ole: @@ -536,8 +470,6 @@ define amdgpu_vs void @f16_ole(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ole half %a, %b @@ -555,8 
+487,6 @@ define amdgpu_vs void @f16_ogt(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_ogt: @@ -568,8 +498,6 @@ define amdgpu_vs void @f16_ogt(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ogt half %a, %b @@ -587,8 +515,6 @@ define amdgpu_vs void @f16_one(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_one: @@ -600,8 +526,6 @@ define amdgpu_vs void @f16_one(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp one half %a, %b @@ -619,8 +543,6 @@ define amdgpu_vs void @f16_oge(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_oge: @@ -632,8 +554,6 @@ define amdgpu_vs void @f16_oge(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp oge half %a, %b @@ -651,8 +571,6 @@ define amdgpu_vs void @f16_ord(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_ord: @@ -664,8 +582,6 @@ define amdgpu_vs void @f16_ord(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ord half %a, %b @@ -683,8 +599,6 @@ define amdgpu_vs void @f16_uno(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_uno: @@ -696,8 +610,6 @@ define amdgpu_vs void @f16_uno(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp uno half %a, %b @@ -715,8 
+627,6 @@ define amdgpu_vs void @f16_ult(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_ult: @@ -728,8 +638,6 @@ define amdgpu_vs void @f16_ult(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ult half %a, %b @@ -747,8 +655,6 @@ define amdgpu_vs void @f16_ueq(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_ueq: @@ -760,8 +666,6 @@ define amdgpu_vs void @f16_ueq(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ueq half %a, %b @@ -779,8 +683,6 @@ define amdgpu_vs void @f16_ule(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_ule: @@ -792,8 +694,6 @@ define amdgpu_vs void @f16_ule(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ule half %a, %b @@ -811,8 +711,6 @@ define amdgpu_vs void @f16_ugt(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_ugt: @@ -824,8 +722,6 @@ define amdgpu_vs void @f16_ugt(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp ugt half %a, %b @@ -843,8 +739,6 @@ define amdgpu_vs void @f16_une(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_une: @@ -856,8 +750,6 @@ define amdgpu_vs void @f16_une(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp une half %a, %b @@ -875,8 
+767,6 @@ define amdgpu_vs void @f16_uge(ptr addrspace(1) inreg %out, half inreg %a, half ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: f16_uge: @@ -888,8 +778,6 @@ define amdgpu_vs void @f16_uge(ptr addrspace(1) inreg %out, half inreg %a, half ; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm entry: %0 = fcmp uge half %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index cc109595d8d7..84ca5dd0c186 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -116,8 +116,6 @@ define amdgpu_kernel void @select_f16( ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -232,8 +230,6 @@ define amdgpu_kernel void @select_f16_imm_a( ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, @@ -345,8 +341,6 @@ define amdgpu_kernel void @select_f16_imm_b( ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -459,8 +453,6 @@ define amdgpu_kernel void @select_f16_imm_c( ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -573,8 +565,6 @@ define amdgpu_kernel void @select_f16_imm_d( ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -732,8 +722,6 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -875,8 +863,6 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, @@ -1016,8 +1002,6 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; 
GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1159,8 +1143,6 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1302,8 +1284,6 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 9b9f03ff74aa..52dbd31b2c64 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -79,8 +79,6 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; GFX11-NEXT: s_mov_b32 s4, s0 ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, ptr addrspace(1) %out @@ -162,8 +160,6 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -261,8 +257,6 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v1, s0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -358,8 +352,6 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -446,8 +438,6 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -530,8 +520,6 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -631,8 +619,6 @@ define amdgpu_kernel 
void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -730,8 +716,6 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 88e2bb772a2d..6ac04d8bc42b 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -115,8 +115,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -272,8 +270,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -390,8 +386,6 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -527,8 +521,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_65: @@ -542,8 +534,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -657,8 +647,6 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -794,8 +782,6 @@ define 
amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16: @@ -809,8 +795,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -924,8 +908,6 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, -16, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1061,8 +1043,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17: @@ -1076,8 +1056,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1191,8 +1169,6 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1361,8 +1337,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1488,8 +1462,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1646,8 +1618,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1779,8 +1749,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1922,8 +1890,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x400007 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2065,8 +2031,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2192,8 +2156,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 7 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2317,8 +2279,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2454,8 +2414,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2591,8 +2549,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x44000000 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2722,8 +2678,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2846,8 +2800,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 
op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2972,8 +2924,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -3103,8 +3053,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -3227,8 +3175,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -3353,8 +3299,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -3507,8 +3451,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1] ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: @@ -3522,8 +3464,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -3676,8 +3616,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: @@ -3691,8 +3629,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 
@llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -3821,8 +3757,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -3951,8 +3885,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -4074,8 +4006,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -4217,8 +4147,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: @@ -4232,8 +4160,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index 3b35b2d3d986..ab00d7d33bb9 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -76,8 +76,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = sitofp i64 %in to half store half %result, ptr addrspace(1) %out @@ -173,8 +171,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -252,8 +248,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-NEXT: s_endpgm %result = sitofp i64 %in to float store float %result, ptr addrspace(1) %out @@ -346,8 +340,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -460,8 +452,6 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; GFX11-NEXT: v_ldexp_f32 v1, v0, s2 ; GFX11-NEXT: v_ldexp_f32 v0, v2, s3 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x float> store <2 x float> %result, ptr addrspace(1) %out @@ -671,8 +661,6 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 ; GFX11-NEXT: v_ldexp_f32 v0, v5, v4 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid @@ -797,8 +785,6 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x half> store <2 x half> %result, ptr addrspace(1) %out @@ -1031,8 +1017,6 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 ; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 9169433cdca5..8d34c7bb14ed 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -58,8 +58,6 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: sitofp_i16_to_f16: @@ -78,8 +76,6 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -147,8 +143,6 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: 
s_endpgm ; ; GFX11-FAKE16-LABEL: sitofp_i32_to_f16: @@ -169,8 +163,6 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -252,8 +244,6 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: sitofp_v2i16_to_v2f16: @@ -276,8 +266,6 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -359,8 +347,6 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16: @@ -385,8 +371,6 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -480,8 +464,6 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16: @@ -512,8 +494,6 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 8e0a83671a18..fdcb6c941e16 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -688,8 +688,6 @@ define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 { ; GFX11-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %exit ; GFX11-NEXT: global_store_b32 v[0:1], v9, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB8_2: ; %bb ; GFX11-NEXT: ;;#ASMSTART @@ -721,8 +719,6 @@ define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg 
%arg) #0 { ; GFX11-NEXT: v_mov_b32_e64 v9, -2 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: global_store_b32 v[0:1], v9, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB8_4: ; GFX11-NEXT: s_mov_b64 exec, 0 @@ -1105,8 +1101,6 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX11-NEXT: v_mov_b32_e32 v0, 8 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB10_4: ; GFX11-NEXT: s_mov_b64 exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index df12e32d971a..85dd5581f287 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -49,8 +49,6 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %result = sub i32 %a, %b store i32 %result, ptr addrspace(1) %out @@ -101,8 +99,6 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %result = sub i32 1234, %a store i32 %result, ptr addrspace(1) %out @@ -162,8 +158,6 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %a = load i32, ptr addrspace(1) %in @@ -226,8 +220,6 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0x7b, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in %result = sub i32 123, %a @@ -292,8 +284,6 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v3 ; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v2 ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in @@ -377,8 +367,6 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1 ; GFX12-NEXT: v_sub_nc_u32_e32 v0, v4, v0 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in @@ -458,8 +446,6 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u16 v0, v1, v0 ; GFX12-NEXT: global_store_b16 v2, v0, s[0:1] -; 
GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid @@ -539,8 +525,6 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid @@ -632,8 +616,6 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: v_pk_sub_i16 v1, v1, v3 ; GFX12-NEXT: v_pk_sub_i16 v0, v0, v2 ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid @@ -697,8 +679,6 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %result = sub i64 %a, %b store i64 %result, ptr addrspace(1) %out, align 8 @@ -783,8 +763,6 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid @@ -882,8 +860,6 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid @@ -1022,8 +998,6 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 5a821db6ff04..18a94f75b6d9 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -81,8 +81,6 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid @@ -164,8 +162,6 
@@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load <2 x i16>, ptr addrspace(4) %in0 %b = load <2 x i16>, ptr addrspace(4) %in1 @@ -203,8 +199,6 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load <2 x i16>, ptr addrspace(4) %in0 %add = sub <2 x i16> %a, %a @@ -268,8 +262,6 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; GFX11-NEXT: s_mov_b32 s4, s0 ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = sub <2 x i16> %a, %b store <2 x i16> %add, ptr addrspace(1) %out @@ -337,8 +329,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid @@ -411,8 +401,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid @@ -483,8 +471,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid @@ -554,8 +540,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid @@ -626,8 +610,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 1.0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid @@ -721,8 +703,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 
0xffff, v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid @@ -826,8 +806,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid @@ -926,8 +904,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid @@ -1036,8 +1012,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 0daa68600726..b8f0d7617167 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -186,8 +186,6 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v1, 3 ; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; HSA-TRAP-GFX1100-NEXT: s_nop 0 -; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; HSA-TRAP-GFX1100-NEXT: s_endpgm ; HSA-TRAP-GFX1100-NEXT: .LBB1_2: ; %trap ; HSA-TRAP-GFX1100-NEXT: s_trap 2 @@ -331,8 +329,6 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX1100-NEXT: ; %bb.1: ; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; HSA-TRAP-GFX1100-NEXT: s_nop 0 -; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; HSA-TRAP-GFX1100-NEXT: s_endpgm ; HSA-TRAP-GFX1100-NEXT: .LBB2_2: ; HSA-TRAP-GFX1100-NEXT: s_trap 2 @@ -465,8 +461,6 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX1100-NEXT: s_trap 3 ; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; HSA-TRAP-GFX1100-NEXT: s_nop 0 -; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; HSA-TRAP-GFX1100-NEXT: s_endpgm ; ; HSA-TRAP-GFX1100-O0-LABEL: debugtrap: diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 3d0fc4e6281a..764f99fb6833 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -64,8 +64,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr 
addrspace(1) %out, i64 %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = uitofp i64 %in to half store half %result, ptr addrspace(1) %out @@ -147,8 +145,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -214,8 +210,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = uitofp i64 %in to float store float %result, ptr addrspace(1) %out @@ -294,8 +288,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -384,8 +376,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; GFX11-NEXT: v_ldexp_f32 v1, v0, s2 ; GFX11-NEXT: v_ldexp_f32 v0, v2, s3 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x float> store <2 x float> %result, ptr addrspace(1) %out @@ -546,8 +536,6 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 ; GFX11-NEXT: v_ldexp_f32 v0, v5, v4 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid @@ -648,8 +636,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x half> store <2 x half> %result, ptr addrspace(1) %out @@ -833,8 +819,6 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 ; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index c4268c15d9db..686dba7e53e6 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -58,8 +58,6 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: uitofp_i16_to_f16: @@ -78,8 +76,6 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -147,8 +143,6 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: uitofp_i32_to_f16: @@ -169,8 +163,6 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -252,8 +244,6 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16: @@ -276,8 +266,6 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v1, v1 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -359,8 +347,6 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16: @@ -385,8 +371,6 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -480,8 +464,6 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-TRUE16-NEXT: s_nop 0 -; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16: @@ -512,8 +494,6 @@ 
define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-FAKE16-NEXT: s_nop 0 -; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index d1bf5ecb5698..e40f1e89afd2 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -89,8 +89,6 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx @@ -156,8 +154,6 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f @@ -223,8 +219,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -290,8 +284,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -357,8 +349,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -424,8 +414,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -509,8 +497,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 
@llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -596,8 +582,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -678,8 +662,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -770,8 +752,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -864,8 +844,6 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -962,8 +940,6 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1073,8 +1049,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1184,8 +1158,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1297,8 +1269,6 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1406,8 +1376,6 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1512,8 +1480,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1616,8 +1582,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1711,8 +1675,6 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1822,8 +1784,6 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1931,8 +1891,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx @@ -2024,8 +1982,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx @@ -2131,8 +2087,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 472a443cf6dd..691009e9c58d 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -75,8 +75,6 @@ define amdgpu_kernel void @madak_f16( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -201,8 +199,6 @@ define amdgpu_kernel void @madak_f16_use_2( ; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 ; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r0, ptr addrspace(1) %r1, diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index ee99fcc58633..3cfc3dcd0efd 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -126,8 +126,6 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SDAG-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; SDAG-GFX11-NEXT: global_store_b32 v2, v0, s[0:1] -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: @@ -193,8 +191,6 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm %src0 = trunc i32 %src0ext to i16 %src1 = trunc i32 %src1ext to i16 @@ -451,8 +447,6 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; SDAG-GFX11-NEXT: s_nop 0 -; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: @@ -526,8 +520,6 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GISEL-GFX11-NEXT: s_nop 0 -; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> <i16 0, i16 0>) %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> <i16 255, i16 255>) diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 2797c5b79888..bdde260ff8bd 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1639,8 +1639,6 @@ define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1751,8 +1749,6 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; GFX11-NEXT: v_dual_mov_b32 v0, s4 
:: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -4496,8 +4492,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll index 4efa1e9353ab..698be7b2d1ec 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll @@ -47,8 +47,6 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" { ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %tid = load volatile i32, ptr addrspace(1) undef %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid @@ -218,8 +216,6 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm .entry: %tid = load volatile i32, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll index 96fa2a45a2dd..8f2ade7ac1a0 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll +++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll @@ -6,8 +6,6 @@ define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %r ; GFX12-LABEL: intrinsic_store_system_scope: ; GFX12: ; %bb.0: ; GFX12-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24) ret void @@ -18,8 +16,6 @@ define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) { ; GFX12: ; %bb.0: ; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm store volatile i32 %val, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 7a1f05f56a75..1e737680313c 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -46,8 +46,6 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; GFX11-NEXT: s_or_b32 s0, s0, 4 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %add = add i16 %load, 999 @@ -103,8 +101,6 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %ext = zext i16 %load to i32 @@ -161,8 +157,6 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %ext = sext i16 %load to i32 @@ -234,8 +228,6 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off ; GFX11-NEXT: global_store_d16_hi_b8 v[2:3], v5, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i17, ptr addrspace(4) %arg, align 4 %add = add i17 %load, 34 @@ -283,8 +275,6 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e64 v2, s0, 4.0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load half, ptr addrspace(4) %arg, align 4 %add = fadd half %load, 4.0 @@ -353,8 +343,6 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %arg, align 4 %add = add <2 x i8> %load, @@ -415,8 +403,6 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, 4, v2 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = zext i32 %tid to i64 @@ -468,8 +454,6 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b8 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i1, ptr addrspace(4) %arg, align 4 %and = and i1 %load, true @@ -524,8 +508,6 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %zext = zext i16 %load to i32 @@ -583,8 +565,6 @@ define amdgpu_kernel void 
@widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i1, ptr addrspace(4) %arg, align 4
   %zext = zext i1 %load to i64
@@ -638,8 +618,6 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) {
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(6) %arg, align 4
   %add = add i16 %load, 999
@@ -691,8 +669,6 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
 ; GFX11-NEXT:    s_or_b32 s0, s0, 1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %arg, align 4, !invariant.load !0
   %add = add i16 %load, 999
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index cb3d76cd9c0b..48ab512a7ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -25,8 +23,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -42,8 +38,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -59,8 +53,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -76,8 +68,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -93,8 +83,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -123,8 +109,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -138,8 +122,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x half> %C
@@ -153,8 +135,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
@@ -170,8 +150,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -187,8 +165,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -204,8 +180,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -221,8 +195,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -238,8 +210,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -255,8 +225,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -272,8 +240,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -289,8 +255,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -306,8 +270,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -323,8 +285,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
@@ -338,8 +298,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -353,8 +311,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
@@ -372,8 +328,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -388,8 +342,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
@@ -408,8 +360,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <8 x float> %C, i32 3
@@ -430,8 +380,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -445,8 +393,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -472,8 +418,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %C = load <16 x half>, ptr %Caddr
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
index a3973970d50d..c208290fb352 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> )
@@ -30,8 +28,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> )
@@ -46,8 +42,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> )
@@ -68,8 +62,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> )
@@ -82,8 +74,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0)
@@ -100,8 +90,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half>
 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0)
@@ -118,8 +106,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0)
@@ -136,8 +122,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0)
@@ -152,8 +136,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0)
@@ -174,8 +156,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0)
@@ -190,8 +170,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0)
@@ -212,8 +190,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0)
@@ -228,8 +204,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> )
@@ -250,8 +224,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> )
@@ -266,8 +238,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> )
@@ -288,8 +258,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> )
@@ -304,8 +272,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> )
@@ -326,8 +292,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> )
@@ -342,8 +306,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> )
@@ -364,8 +326,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> )
@@ -380,8 +340,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0)
@@ -402,8 +360,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
index dbb4db05a35c..d99ed8a42ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -40,8 +36,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -58,8 +52,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
@@ -74,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
@@ -90,8 +80,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
@@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -124,8 +110,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -140,8 +124,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -160,8 +142,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -176,8 +156,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -192,8 +170,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
@@ -210,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -226,8 +200,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -242,8 +214,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
@@ -260,8 +230,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -276,8 +244,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -292,8 +258,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
index 009288dbdf53..d10dfcaeba7c 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
@@ -19,8 +19,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -51,8 +49,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -77,8 +73,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[22:25], off
 ; GFX12-NEXT:    global_store_b128 v[20:21], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -103,8 +97,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[22:25], off
 ; GFX12-NEXT:    global_store_b128 v[20:21], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -135,8 +127,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -167,8 +157,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -199,8 +187,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -231,8 +217,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -263,8 +247,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -295,8 +277,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
index 1012287838f1..6174841eb27c 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %C)
@@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
@@ -38,8 +34,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
@@ -52,8 +46,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
@@ -68,8 +60,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -84,8 +74,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
@@ -100,8 +88,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -116,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -132,8 +116,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -148,8 +130,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -164,8 +144,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -181,8 +159,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
@@ -197,8 +173,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
@@ -211,8 +185,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
@@ -225,8 +197,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
@@ -241,8 +211,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -257,8 +225,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -273,8 +239,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -289,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -305,8 +267,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -321,8 +281,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -337,8 +295,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 645f4981ba31..8f7cd5cb2bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -21,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -36,8 +32,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -51,8 +45,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -66,8 +58,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -81,8 +71,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -96,8 +84,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -111,8 +97,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -126,8 +110,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x half> %C
@@ -141,8 +123,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
@@ -156,8 +136,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -171,8 +149,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -186,8 +162,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -201,8 +175,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -216,8 +188,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -231,8 +201,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -246,8 +214,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -261,8 +227,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -276,8 +240,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -291,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -306,8 +266,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -321,8 +279,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -338,8 +294,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -354,8 +308,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
@@ -372,8 +324,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <4 x float> %C, i32 3
@@ -392,8 +342,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -407,8 +355,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -429,8 +375,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %C = load <8 x half>, ptr %Caddr
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index ad1d66e448b2..5b01b174144d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> )
@@ -25,8 +23,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half>
 ; GFX12-NEXT:    v_mov_b32_e32 v9, v6
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> )
@@ -39,8 +35,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> )
@@ -58,8 +52,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v9, v6
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> )
@@ -72,8 +64,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0)
@@ -89,8 +79,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half>
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> , i1 0)
@@ -106,8 +94,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0)
@@ -123,8 +109,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> , i1 0)
@@ -137,8 +121,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -156,8 +138,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -170,8 +150,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -189,8 +167,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -203,8 +179,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> )
@@ -222,8 +196,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> )
@@ -236,8 +208,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> )
@@ -255,8 +225,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> )
@@ -269,8 +237,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> )
@@ -288,8 +254,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> )
@@ -302,8 +266,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> )
@@ -321,8 +283,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> )
@@ -335,8 +295,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
@@ -354,8 +312,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index c52b079aec2f..616fa3927737 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res =
call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) @@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) @@ -50,8 +44,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) @@ -64,8 +56,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) @@ -78,8 +68,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) @@ -94,8 +82,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) @@ -108,8 +94,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) @@ -122,8 +106,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) @@ -136,8 +118,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 
neg_lo:[1,0,0] ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) @@ -150,8 +130,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) @@ -164,8 +142,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) @@ -180,8 +156,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) @@ -194,8 +168,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) @@ -208,8 +180,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) @@ -224,8 +194,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) @@ -238,8 +206,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) @@ -252,8 +218,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll index ebbb3d1fe7de..311e76b9bb2b 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll @@ -27,8 +27,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off ; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -73,8 +71,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off ; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -113,8 +109,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off ; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off ; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -153,8 +147,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off ; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off ; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -199,8 +191,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off ; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -233,8 +223,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off ; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: 
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -261,8 +249,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off ; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -301,8 +287,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off ; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -347,8 +331,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off ; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -393,8 +375,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off ; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 @@ -439,8 +419,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off ; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll index 0f8df5a6e3d2..901405cbd0a1 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll @@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %C) @@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, < ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C) @@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, ; GFX12: ; %bb.0: ; %bb ; 
GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) @@ -48,8 +42,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) @@ -62,8 +54,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) @@ -76,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) @@ -90,8 +78,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C) @@ -104,8 +90,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C) @@ -118,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C) @@ -132,8 +114,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C) @@ -146,8 +126,6 @@ define amdgpu_ps void 
@test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) @@ -160,8 +138,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) @@ -174,8 +150,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) @@ -188,8 +162,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) @@ -202,8 +174,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) @@ -216,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) @@ -230,8 +198,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) @@ -244,8 +210,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off 
-; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) @@ -258,8 +222,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) @@ -272,8 +234,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) @@ -286,8 +246,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) @@ -300,8 +258,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll index ac218a2b19c6..b7b6028c86dc 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll @@ -31,8 +31,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) @@ -55,8 +53,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) @@ -79,8 +75,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], 
v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0) @@ -101,8 +95,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1) @@ -125,8 +117,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0) @@ -147,8 +137,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1) @@ -171,8 +159,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -193,8 +179,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -215,8 +199,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -237,8 +219,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0) @@ -259,8 +239,6 @@ define amdgpu_ps void 
@test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -281,8 +259,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -303,8 +279,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -325,8 +299,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1) @@ -349,8 +321,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -371,8 +341,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -393,8 +361,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -415,8 +381,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: 
s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) @@ -438,8 +402,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -460,8 +422,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -482,8 +442,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32 ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) @@ -504,8 +462,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off -; W32-NEXT: s_nop 0 -; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1) diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll index 9ec2d8f33eda..524a25cbc1e6 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll @@ -27,8 +27,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B ; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %A, <16 x half> %B, <4 x float> %C) @@ -47,8 +45,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, ; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C) @@ -67,8 +63,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off -; W64-NEXT: s_nop 0 -; 
W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0) @@ -85,8 +79,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1) @@ -105,8 +97,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0) @@ -123,8 +113,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1) @@ -143,8 +131,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -162,8 +148,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -180,8 +164,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -198,8 +180,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 
-; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) @@ -216,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -234,8 +212,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32 ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -252,8 +228,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32 ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -270,8 +244,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) @@ -290,8 +262,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -308,8 +278,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -326,8 +294,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: 
global_store_b128 v[10:11], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -344,8 +310,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) @@ -362,8 +326,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) @@ -380,8 +342,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) @@ -398,8 +358,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32 ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) @@ -416,8 +374,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off -; W64-NEXT: s_nop 0 -; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index d31c9e7e03e7..c5a9ab31ca52 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -21,8 +21,6 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -53,8 +51,6 @@ define 
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: global_store_b32 v1, v2, s[2:3]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
store i32 %idx, ptr addrspace(1) %ptrx
@@ -96,8 +92,6 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: global_store_b32 v1, v2, s[6:7]
; GFX12-NEXT: global_store_b32 v1, v3, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
store i32 %idx, ptr addrspace(1) %ptrx
--
GitLab

From ac9ee618572537bcd77c58899aaab1d41dbad206 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru
Date: Mon, 21 Oct 2024 09:49:58 -0700
Subject: [PATCH 259/511] [acc] Improve LegalizeDataValues pass to handle data
 constructs (#112990)

Renames LegalizeData to LegalizeDataValues, since this pass fixes up SSA
values; the old name suggested that it fixed data mapping. This change also
adds support for fixing up SSA values for data clause operations: compute
regions nested within a data region now use the SSA values produced by the
data operations, while SSA values used inside a data region but outside any
compute region are left unchanged. This supports the requirement in the
OpenACC spec that a visible data clause is not just one on the current
compute construct but also one on a lexically containing data construct or
visible declare directive.
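For illustration, a minimal sketch of the host-to-device rewrite (adapted
from the new mlir test added below; the exact IR here is illustrative, not
part of the change itself):

  func.func @test(%a: memref<10xf32>, %i : index) {
    %c0 = arith.constant 0.000000e+00 : f32
    %create = acc.create varPtr(%a : memref<10xf32>) -> memref<10xf32>
    acc.data dataOperands(%create : memref<10xf32>) {
      // Not inside a compute region: this use of %a is left unchanged.
      memref.store %c0, %a[%i] : memref<10xf32>
      acc.serial {
        // Inside a compute region nested in acc.data: this use of %a is
        // rewritten to use %create, the result of the data clause operation.
        %v = memref.load %a[%i] : memref<10xf32>
        acc.yield
      }
      acc.terminator
    }
    return
  }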
"i"} + return +} + +// CHECK-LABEL: func.func @_QPsub1 +// CHECK-SAME: (%[[ARG0:.*]]: !fir.ref {fir.bindc_name = "i"}) +// CHECK: %[[I:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFsub1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[I]]#0 : !fir.ref) -> !fir.ref {dataClause = #acc, name = "i"} +// CHECK: acc.data dataOperands(%[[COPYIN]] : !fir.ref) { +// CHECK: %[[C0:.*]] = arith.constant 0 : i32 +// CHECK: hlfir.assign %[[C0]] to %0#0 : i32, !fir.ref +// CHECK: acc.serial { +// CHECK: hlfir.assign %[[C0]] to %[[COPYIN]] : i32, !fir.ref +// CHECK: acc.yield +// CHECK: } +// CHECK: acc.terminator +// CHECK: } +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref) to varPtr(%[[I]]#0 : !fir.ref) {dataClause = #acc, name = "i"} diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h index ca96ce62ae40..cda07d6a9136 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -56,14 +56,14 @@ mlir::acc::ParallelOp, mlir::acc::KernelsOp, mlir::acc::SerialOp #define ACC_COMPUTE_CONSTRUCT_AND_LOOP_OPS \ ACC_COMPUTE_CONSTRUCT_OPS, mlir::acc::LoopOp -#define OPENACC_DATA_CONSTRUCT_STRUCTURED_OPS \ +#define ACC_DATA_CONSTRUCT_STRUCTURED_OPS \ mlir::acc::DataOp, mlir::acc::DeclareOp #define ACC_DATA_CONSTRUCT_UNSTRUCTURED_OPS \ mlir::acc::EnterDataOp, mlir::acc::ExitDataOp, mlir::acc::UpdateOp, \ mlir::acc::HostDataOp, mlir::acc::DeclareEnterOp, \ mlir::acc::DeclareExitOp #define ACC_DATA_CONSTRUCT_OPS \ - OPENACC_DATA_CONSTRUCT_STRUCTURED_OPS, ACC_DATA_CONSTRUCT_UNSTRUCTURED_OPS + ACC_DATA_CONSTRUCT_STRUCTURED_OPS, ACC_DATA_CONSTRUCT_UNSTRUCTURED_OPS #define ACC_COMPUTE_AND_DATA_CONSTRUCT_OPS \ ACC_COMPUTE_CONSTRUCT_OPS, ACC_DATA_CONSTRUCT_OPS #define ACC_COMPUTE_LOOP_AND_DATA_CONSTRUCT_OPS \ diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h index bb93c78bf6ea..57d532b078b9 100644 --- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.h @@ -11,9 +11,6 @@ #include "mlir/Pass/Pass.h" -#define GEN_PASS_DECL -#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc" - namespace mlir { namespace func { @@ -22,8 +19,8 @@ class FuncOp; namespace acc { -/// Create a pass to replace ssa values in region with device/host values. -std::unique_ptr> createLegalizeDataInRegion(); +#define GEN_PASS_DECL +#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc" /// Generate the code for registering conversion passes. #define GEN_PASS_REGISTRATION diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td index abbc27765e34..9ceb91e5679a 100644 --- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td @@ -11,18 +11,20 @@ include "mlir/Pass/PassBase.td" -def LegalizeDataInRegion : Pass<"openacc-legalize-data", "mlir::func::FuncOp"> { - let summary = "Legalize the data in the compute region"; +def LegalizeDataValuesInRegion : Pass<"openacc-legalize-data-values", "mlir::func::FuncOp"> { + let summary = "Legalizes SSA values in compute regions with results from data clause operations"; let description = [{ - This pass replace uses of varPtr in the compute region with their accPtr - gathered from the data clause operands. 
+ This pass replace uses of the `varPtr` in compute regions (kernels, + parallel, serial) with the result of data clause operations (`accPtr`). }]; let options = [ Option<"hostToDevice", "host-to-device", "bool", "true", "Replace varPtr uses with accPtr if true. Replace accPtr uses with " - "varPtr if false"> + "varPtr if false">, + Option<"applyToAccDataConstruct", "apply-to-acc-data-construct", "bool", "true", + "Replaces varPtr uses with accPtr for acc compute regions contained " + "within acc.data or acc.declare region."> ]; - let constructor = "::mlir::acc::createLegalizeDataInRegion()"; } #endif // MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt index 41ba7f8f53d3..7d934956089a 100644 --- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_dialect_library(MLIROpenACCTransforms - LegalizeData.cpp + LegalizeDataValues.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenACC diff --git a/mlir/lib/Dialect/OpenACC/Transforms/LegalizeData.cpp b/mlir/lib/Dialect/OpenACC/Transforms/LegalizeDataValues.cpp similarity index 54% rename from mlir/lib/Dialect/OpenACC/Transforms/LegalizeData.cpp rename to mlir/lib/Dialect/OpenACC/Transforms/LegalizeDataValues.cpp index db6b472ff973..4038e333adb8 100644 --- a/mlir/lib/Dialect/OpenACC/Transforms/LegalizeData.cpp +++ b/mlir/lib/Dialect/OpenACC/Transforms/LegalizeDataValues.cpp @@ -1,4 +1,4 @@ -//===- LegalizeData.cpp - -------------------------------------------------===// +//===- LegalizeDataValues.cpp - -------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,10 +12,11 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/RegionUtils.h" +#include "llvm/Support/ErrorHandling.h" namespace mlir { namespace acc { -#define GEN_PASS_DEF_LEGALIZEDATAINREGION +#define GEN_PASS_DEF_LEGALIZEDATAVALUESINREGION #include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc" } // namespace acc } // namespace mlir @@ -24,6 +25,17 @@ using namespace mlir; namespace { +static bool insideAccComputeRegion(mlir::Operation *op) { + mlir::Operation *parent{op->getParentOp()}; + while (parent) { + if (isa(parent)) { + return true; + } + parent = parent->getParentOp(); + } + return false; +} + static void collectPtrs(mlir::ValueRange operands, llvm::SmallVector> &values, bool hostToDevice) { @@ -39,6 +51,25 @@ static void collectPtrs(mlir::ValueRange operands, } } +template +static void replaceAllUsesInAccComputeRegionsWith(Value orig, Value replacement, + Region &outerRegion) { + for (auto &use : llvm::make_early_inc_range(orig.getUses())) { + if (outerRegion.isAncestor(use.getOwner()->getParentRegion())) { + if constexpr (std::is_same_v || + std::is_same_v) { + // For data construct regions, only replace uses in contained compute + // regions. 
+        if (insideAccComputeRegion(use.getOwner())) {
+          use.set(replacement);
+        }
+      } else {
+        use.set(replacement);
+      }
+    }
+  }
+}
+
 template <typename Op>
 static void collectAndReplaceInRegion(Op &op, bool hostToDevice) {
   llvm::SmallVector<std::pair<mlir::Value, mlir::Value>> values;
@@ -48,7 +79,9 @@ static void collectAndReplaceInRegion(Op &op, bool hostToDevice) {
     collectPtrs(op.getPrivateOperands(), values, hostToDevice);
   } else {
     collectPtrs(op.getDataClauseOperands(), values, hostToDevice);
-    if constexpr (!std::is_same_v<Op, acc::LoopOp>) {
+    if constexpr (!std::is_same_v<Op, acc::LoopOp> &&
+                  !std::is_same_v<Op, acc::DataOp> &&
+                  !std::is_same_v<Op, acc::DeclareOp>) {
       collectPtrs(op.getReductionOperands(), values, hostToDevice);
       collectPtrs(op.getGangPrivateOperands(), values, hostToDevice);
       collectPtrs(op.getGangFirstPrivateOperands(), values, hostToDevice);
@@ -56,18 +89,25 @@ static void collectAndReplaceInRegion(Op &op, bool hostToDevice) {
   }
 
   for (auto p : values)
-    replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), op.getRegion());
+    replaceAllUsesInAccComputeRegionsWith<Op>(std::get<0>(p), std::get<1>(p),
+                                              op.getRegion());
 }
 
-struct LegalizeDataInRegion
-    : public acc::impl::LegalizeDataInRegionBase<LegalizeDataInRegion> {
+class LegalizeDataValuesInRegion
+    : public acc::impl::LegalizeDataValuesInRegionBase<
+          LegalizeDataValuesInRegion> {
+public:
+  using LegalizeDataValuesInRegionBase<
+      LegalizeDataValuesInRegion>::LegalizeDataValuesInRegionBase;
 
   void runOnOperation() override {
     func::FuncOp funcOp = getOperation();
     bool replaceHostVsDevice = this->hostToDevice.getValue();
 
     funcOp.walk([&](Operation *op) {
-      if (!isa<ACC_COMPUTE_CONSTRUCT_OPS>(*op) && !isa<acc::LoopOp>(*op))
+      if (!isa<ACC_COMPUTE_CONSTRUCT_AND_LOOP_OPS>(*op) &&
+          !(isa<ACC_DATA_CONSTRUCT_STRUCTURED_OPS>(*op) &&
+            applyToAccDataConstruct))
         return;
 
       if (auto parallelOp = dyn_cast<acc::ParallelOp>(*op)) {
@@ -78,14 +118,15 @@ struct LegalizeDataInRegion
         collectAndReplaceInRegion(kernelsOp, replaceHostVsDevice);
       } else if (auto loopOp = dyn_cast<acc::LoopOp>(*op)) {
         collectAndReplaceInRegion(loopOp, replaceHostVsDevice);
+      } else if (auto dataOp = dyn_cast<acc::DataOp>(*op)) {
+        collectAndReplaceInRegion(dataOp, replaceHostVsDevice);
+      } else if (auto declareOp = dyn_cast<acc::DeclareOp>(*op)) {
+        collectAndReplaceInRegion(declareOp, replaceHostVsDevice);
+      } else {
+        llvm_unreachable("unsupported acc region op");
       }
     });
   }
 };
 
 } // end anonymous namespace
-
-std::unique_ptr<OperationPass<func::FuncOp>>
-mlir::acc::createLegalizeDataInRegion() {
-  return std::make_unique<LegalizeDataInRegion>();
-}
diff --git a/mlir/test/Dialect/OpenACC/legalize-data.mlir b/mlir/test/Dialect/OpenACC/legalize-data.mlir
index 113fe90450ab..842f8e260c49 100644
--- a/mlir/test/Dialect/OpenACC/legalize-data.mlir
+++ b/mlir/test/Dialect/OpenACC/legalize-data.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt -split-input-file --openacc-legalize-data %s | FileCheck %s --check-prefixes=CHECK,DEVICE
-// RUN: mlir-opt -split-input-file --openacc-legalize-data=host-to-device=false %s | FileCheck %s --check-prefixes=CHECK,HOST
+// RUN: mlir-opt -split-input-file --openacc-legalize-data-values %s | FileCheck %s --check-prefixes=CHECK,DEVICE
+// RUN: mlir-opt -split-input-file --openacc-legalize-data-values=host-to-device=false %s | FileCheck %s --check-prefixes=CHECK,HOST
 
 func.func @test(%a: memref<10xf32>, %i : index) {
   %create = acc.create varPtr(%a : memref<10xf32>) -> memref<10xf32>
@@ -61,6 +61,32 @@ func.func @test(%a: memref<10xf32>, %i : index) {
 
 // -----
 
+func.func @test(%a: memref<10xf32>, %i : index) {
+  %create = acc.create varPtr(%a : memref<10xf32>) -> memref<10xf32>
+  acc.data dataOperands(%create : memref<10xf32>) {
+    %c0 = arith.constant 0.000000e+00 : f32
+    memref.store %c0, %a[%i] : memref<10xf32>
+    acc.serial {
+      %cs = memref.load %a[%i] : memref<10xf32>
+      acc.yield
+    }
+    acc.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @test
+// CHECK-SAME: (%[[A:.*]]: memref<10xf32>, %[[I:.*]]: index)
+// CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[A]] : memref<10xf32>) -> memref<10xf32>
+// CHECK: acc.data dataOperands(%[[CREATE]] : memref<10xf32>) {
+// CHECK: memref.store %{{.*}}, %[[A]][%[[I]]] : memref<10xf32>
+// DEVICE: %{{.*}} = memref.load %[[CREATE]][%[[I]]] : memref<10xf32>
+// HOST: %{{.*}} = memref.load %[[A]][%[[I]]] : memref<10xf32>
+// CHECK: acc.terminator
+// CHECK: }
+
+// -----
+
 func.func @test(%a: memref<10xf32>) {
   %lb = arith.constant 0 : index
   %st = arith.constant 1 : index
-- 
GitLab


From dcbf2c2ca078367fcd84feae9a51226b9761117a Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com>
Date: Mon, 21 Oct 2024 12:51:01 -0400
Subject: [PATCH 260/511] [Scalarizer][DirectX] support structs return types
 (#111569)

Based on this RFC:
https://discourse.llvm.org/t/rfc-allow-the-scalarizer-pass-to-scalarize-vectors-returned-in-structs/82306

LLVM intrinsics do not support out params. To get around this
limitation implementers will make intrinsics return structs to capture
a return type and an out param. This implementation detail should not
impact scalarization since these cases should be elementwise operations.

## Three changes are needed.
- The CallInst visitor needs to be updated to handle Structs
- A new visitor is needed for `ExtractValue` instructions
- finish needs to be updated to handle structs so that insert elements
are properly propagated.

## Testing changes
- Add support for `llvm.frexp`
- Add support for `llvm.dx.splitdouble`

fixes https://github.com/llvm/llvm-project/issues/111437
---
 llvm/include/llvm/Analysis/VectorUtils.h      |   5 +
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |   2 +
 llvm/lib/Analysis/VectorUtils.cpp             |  10 ++
 .../DirectX/DirectXTargetTransformInfo.cpp    |  77 ++++++------
 llvm/lib/Transforms/Scalar/Scalarizer.cpp     | 104 +++++++++++++++-
 llvm/test/CodeGen/DirectX/split-double.ll     |  45 ++++++++
 llvm/test/Transforms/Scalarizer/frexp.ll      |  93 ++++++++++++++++
 7 files changed, 297 insertions(+), 39 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/split-double.ll
 create mode 100644 llvm/test/Transforms/Scalarizer/frexp.ll

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index e2dd4976f390..467d5932cacf 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -154,6 +154,11 @@ bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
 /// the operand at index \p OpdIdx, or on the return type if \p OpdIdx is -1.
 bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx);
 
+/// Identifies if the vector form of the intrinsic that returns a struct is
+/// overloaded at the struct element index \p RetIdx.
+bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID,
+                                                      int RetIdx);
+
 /// Returns intrinsic ID for call.
 /// For the input call instruction it finds mapping intrinsic and returns
 /// its intrinsic ID, in case it does not found it return not_intrinsic.
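
For example (a sketch mirroring the frexp test added below, not an extra
change in this patch): both struct fields of `llvm.frexp` participate in the
overloaded name, so scalarizing

```llvm
%r = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %f)
```

yields per-lane calls to `{ float, i32 } @llvm.frexp.f32.i32`, whereas for
most other struct-returning intrinsics only field 0 is overloaded.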
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 27a437a83be6..e30d37f69f78 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -89,5 +89,7 @@ def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrCon def int_dx_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; +def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>], + [LLVMScalarOrSameVectorWidth<0, llvm_double_ty>], [IntrNoMem]>; def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 6b5251e0ad34..37c443011719 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -152,6 +152,16 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, } } +bool llvm::isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, + int RetIdx) { + switch (ID) { + case Intrinsic::frexp: + return RetIdx == 0 || RetIdx == 1; + default: + return RetIdx == 0; + } +} + /// Returns intrinsic ID for call. /// For the input call instruction it finds mapping intrinsic and returns /// its ID, in case it does not found it return not_intrinsic. diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 8ea31401121b..231afd8ae3ee 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -1,38 +1,39 @@ -//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++ -//-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -//===----------------------------------------------------------------------===// - -#include "DirectXTargetTransformInfo.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IntrinsicsDirectX.h" - -using namespace llvm; - -bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, - unsigned ScalarOpdIdx) { - switch (ID) { - case Intrinsic::dx_wave_readlane: - return ScalarOpdIdx == 1; - default: - return false; - } -} - -bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( - Intrinsic::ID ID) const { - switch (ID) { - case Intrinsic::dx_frac: - case Intrinsic::dx_rsqrt: - case Intrinsic::dx_wave_readlane: - return true; - default: - return false; - } -} +//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +//===----------------------------------------------------------------------===// + +#include "DirectXTargetTransformInfo.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsDirectX.h" + +using namespace llvm; + +bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, + unsigned ScalarOpdIdx) { + switch (ID) { + case Intrinsic::dx_wave_readlane: + return ScalarOpdIdx == 1; + default: + return false; + } +} + +bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( + Intrinsic::ID ID) const { + switch (ID) { + case Intrinsic::dx_frac: + case Intrinsic::dx_rsqrt: + case Intrinsic::dx_wave_readlane: + case Intrinsic::dx_splitdouble: + return true; + default: + return false; + } +} diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index b1e4c7e52d99..772f4c6c35dd 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -197,6 +197,24 @@ struct VectorLayout { uint64_t SplitSize = 0; }; +static bool isStructOfMatchingFixedVectors(Type *Ty) { + if (!isa(Ty)) + return false; + unsigned StructSize = Ty->getNumContainedTypes(); + if (StructSize < 1) + return false; + FixedVectorType *VecTy = dyn_cast(Ty->getContainedType(0)); + if (!VecTy) + return false; + unsigned VecSize = VecTy->getNumElements(); + for (unsigned I = 1; I < StructSize; I++) { + VecTy = dyn_cast(Ty->getContainedType(I)); + if (!VecTy || VecSize != VecTy->getNumElements()) + return false; + } + return true; +} + /// Concatenate the given fragments to a single vector value of the type /// described in @p VS. static Value *concatenate(IRBuilder<> &Builder, ArrayRef Fragments, @@ -276,6 +294,7 @@ public: bool visitBitCastInst(BitCastInst &BCI); bool visitInsertElementInst(InsertElementInst &IEI); bool visitExtractElementInst(ExtractElementInst &EEI); + bool visitExtractValueInst(ExtractValueInst &EVI); bool visitShuffleVectorInst(ShuffleVectorInst &SVI); bool visitPHINode(PHINode &PHI); bool visitLoadInst(LoadInst &LI); @@ -667,6 +686,12 @@ bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) { bool ScalarizerVisitor::isTriviallyScalarizable(Intrinsic::ID ID) { if (isTriviallyVectorizable(ID)) return true; + // TODO: Move frexp to isTriviallyVectorizable. + // https://github.com/llvm/llvm-project/issues/112408 + switch (ID) { + case Intrinsic::frexp: + return true; + } return Intrinsic::isTargetIntrinsic(ID) && TTI->isTargetIntrinsicTriviallyScalarizable(ID); } @@ -674,7 +699,13 @@ bool ScalarizerVisitor::isTriviallyScalarizable(Intrinsic::ID ID) { /// If a call to a vector typed intrinsic function, split into a scalar call per /// element if possible for the intrinsic. 
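 /// A struct-of-vectors return is split the same way when every struct member
 /// is a fixed vector with the same element count: e.g. a call returning
 /// { <2 x float>, <2 x i32> } (such as llvm.frexp) becomes per-lane calls
 /// returning { float, i32 }.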
bool ScalarizerVisitor::splitCall(CallInst &CI) { - std::optional VS = getVectorSplit(CI.getType()); + Type *CallType = CI.getType(); + bool AreAllVectorsOfMatchingSize = isStructOfMatchingFixedVectors(CallType); + std::optional VS; + if (AreAllVectorsOfMatchingSize) + VS = getVectorSplit(CallType->getContainedType(0)); + else + VS = getVectorSplit(CallType); if (!VS) return false; @@ -699,6 +730,23 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { if (isVectorIntrinsicWithOverloadTypeAtArg(ID, -1)) Tys.push_back(VS->SplitTy); + if (AreAllVectorsOfMatchingSize) { + for (unsigned I = 1; I < CallType->getNumContainedTypes(); I++) { + std::optional CurrVS = + getVectorSplit(cast(CallType->getContainedType(I))); + // This case does not seem to happen, but it is possible for + // VectorSplit.NumPacked >= NumElems. If that happens a VectorSplit + // is not returned and we will bailout of handling this call. + // The secondary bailout case is if NumPacked does not match. + // This can happen if ScalarizeMinBits is not set to the default. + // This means with certain ScalarizeMinBits intrinsics like frexp + // will only scalarize when the struct elements have the same bitness. + if (!CurrVS || CurrVS->NumPacked != VS->NumPacked) + return false; + if (isVectorIntrinsicWithStructReturnOverloadAtField(ID, I)) + Tys.push_back(CurrVS->SplitTy); + } + } // Assumes that any vector type has the same number of elements as the return // vector type, which is true for all current intrinsics. for (unsigned I = 0; I != NumArgs; ++I) { @@ -1030,6 +1078,31 @@ bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) { return true; } +bool ScalarizerVisitor::visitExtractValueInst(ExtractValueInst &EVI) { + Value *Op = EVI.getOperand(0); + Type *OpTy = Op->getType(); + ValueVector Res; + if (!isStructOfMatchingFixedVectors(OpTy)) + return false; + Type *VecType = cast(OpTy->getContainedType(0)); + std::optional VS = getVectorSplit(VecType); + if (!VS) + return false; + IRBuilder<> Builder(&EVI); + Scatterer Op0 = scatter(&EVI, Op, *VS); + assert(!EVI.getIndices().empty() && "Make sure an index exists"); + // Note for our use case we only care about the top level index. 
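+  // E.g. %e = extractvalue { <2 x half>, <2 x i32> } %r, 1 scatters into one
+  // extractvalue { half, i32 } ..., 1 per fragment of %r (see frexp.ll).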
+ unsigned Index = EVI.getIndices()[0]; + for (unsigned OpIdx = 0; OpIdx < Op0.size(); ++OpIdx) { + Value *ResElem = Builder.CreateExtractValue( + Op0[OpIdx], Index, EVI.getName() + ".elem" + Twine(Index)); + Res.push_back(ResElem); + } + + gather(&EVI, Res, *VS); + return true; +} + bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { std::optional VS = getVectorSplit(EEI.getOperand(0)->getType()); if (!VS) @@ -1209,6 +1282,35 @@ bool ScalarizerVisitor::finish() { Res = concatenate(Builder, CV, VS, Op->getName()); Res->takeName(Op); + } else if (auto *Ty = dyn_cast(Op->getType())) { + BasicBlock *BB = Op->getParent(); + IRBuilder<> Builder(Op); + if (isa(Op)) + Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); + + // Iterate over each element in the struct + unsigned NumOfStructElements = Ty->getNumElements(); + SmallVector ElemCV(NumOfStructElements); + for (unsigned I = 0; I < NumOfStructElements; ++I) { + for (auto *CVelem : CV) { + Value *Elem = Builder.CreateExtractValue( + CVelem, I, Op->getName() + ".elem" + Twine(I)); + ElemCV[I].push_back(Elem); + } + } + Res = PoisonValue::get(Ty); + for (unsigned I = 0; I < NumOfStructElements; ++I) { + Type *ElemTy = Ty->getElementType(I); + assert(isa(ElemTy) && + "Only Structs of all FixedVectorType supported"); + VectorSplit VS = *getVectorSplit(ElemTy); + assert(VS.NumFragments == CV.size()); + + Value *ConcatenatedVector = + concatenate(Builder, ElemCV[I], VS, Op->getName()); + Res = Builder.CreateInsertValue(Res, ConcatenatedVector, I, + Op->getName() + ".insert"); + } } else { assert(CV.size() == 1 && Op->getType() == CV[0]->getType()); Res = CV[0]; diff --git a/llvm/test/CodeGen/DirectX/split-double.ll b/llvm/test/CodeGen/DirectX/split-double.ll new file mode 100644 index 000000000000..759590fa5627 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/split-double.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='function(scalarizer)' -S -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +define void @test_vector_double_split_void(<2 x double> noundef %d) { +; CHECK-LABEL: define void @test_vector_double_split_void( +; CHECK-SAME: <2 x double> noundef [[D:%.*]]) { +; CHECK-NEXT: [[D_I0:%.*]] = extractelement <2 x double> [[D]], i64 0 +; CHECK-NEXT: [[HLSL_ASUINT_I0:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I0]]) +; CHECK-NEXT: [[D_I1:%.*]] = extractelement <2 x double> [[D]], i64 1 +; CHECK-NEXT: [[HLSL_ASUINT_I1:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I1]]) +; CHECK-NEXT: ret void +; + %hlsl.asuint = call { <2 x i32>, <2 x i32> } @llvm.dx.splitdouble.v2i32(<2 x double> %d) + ret void +} + +define noundef <3 x i32> @test_vector_double_split(<3 x double> noundef %d) { +; CHECK-LABEL: define noundef <3 x i32> @test_vector_double_split( +; CHECK-SAME: <3 x double> noundef [[D:%.*]]) { +; CHECK-NEXT: [[D_I0:%.*]] = extractelement <3 x double> [[D]], i64 0 +; CHECK-NEXT: [[HLSL_ASUINT_I0:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I0]]) +; CHECK-NEXT: [[D_I1:%.*]] = extractelement <3 x double> [[D]], i64 1 +; CHECK-NEXT: [[HLSL_ASUINT_I1:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I1]]) +; CHECK-NEXT: [[D_I2:%.*]] = extractelement <3 x double> [[D]], i64 2 +; CHECK-NEXT: [[HLSL_ASUINT_I2:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I2]]) +; CHECK-NEXT: [[DOTELEM0:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I0]], 0 +; CHECK-NEXT: 
[[DOTELEM01:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I1]], 0 +; CHECK-NEXT: [[DOTELEM02:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I2]], 0 +; CHECK-NEXT: [[DOTELEM1:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I0]], 1 +; CHECK-NEXT: [[DOTELEM13:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I1]], 1 +; CHECK-NEXT: [[DOTELEM14:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I2]], 1 +; CHECK-NEXT: [[DOTI0:%.*]] = add i32 [[DOTELEM0]], [[DOTELEM1]] +; CHECK-NEXT: [[DOTI1:%.*]] = add i32 [[DOTELEM01]], [[DOTELEM13]] +; CHECK-NEXT: [[DOTI2:%.*]] = add i32 [[DOTELEM02]], [[DOTELEM14]] +; CHECK-NEXT: [[DOTUPTO015:%.*]] = insertelement <3 x i32> poison, i32 [[DOTI0]], i64 0 +; CHECK-NEXT: [[DOTUPTO116:%.*]] = insertelement <3 x i32> [[DOTUPTO015]], i32 [[DOTI1]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> [[DOTUPTO116]], i32 [[DOTI2]], i64 2 +; CHECK-NEXT: ret <3 x i32> [[TMP1]] +; + %hlsl.asuint = call { <3 x i32>, <3 x i32> } @llvm.dx.splitdouble.v3i32(<3 x double> %d) + %1 = extractvalue { <3 x i32>, <3 x i32> } %hlsl.asuint, 0 + %2 = extractvalue { <3 x i32>, <3 x i32> } %hlsl.asuint, 1 + %3 = add <3 x i32> %1, %2 + ret <3 x i32> %3 +} diff --git a/llvm/test/Transforms/Scalarizer/frexp.ll b/llvm/test/Transforms/Scalarizer/frexp.ll new file mode 100644 index 000000000000..6397832f6648 --- /dev/null +++ b/llvm/test/Transforms/Scalarizer/frexp.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt %s -passes='function(scalarizer)' -S | FileCheck %s + +define void @test_vector_frexp_void(<2 x double> noundef %d) { +; CHECK-LABEL: define void @test_vector_frexp_void( +; CHECK-SAME: <2 x double> noundef [[D:%.*]]) { +; CHECK-NEXT: [[D_I0:%.*]] = extractelement <2 x double> [[D]], i64 0 +; CHECK-NEXT: [[DOTI0:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[D_I0]]) +; CHECK-NEXT: [[D_I1:%.*]] = extractelement <2 x double> [[D]], i64 1 +; CHECK-NEXT: [[DOTI1:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[D_I1]]) +; CHECK-NEXT: ret void +; + %1 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %d) + ret void +} + +define noundef <2 x half> @test_vector_half_frexp_half(<2 x half> noundef %h) { +; CHECK-LABEL: define noundef <2 x half> @test_vector_half_frexp_half( +; CHECK-SAME: <2 x half> noundef [[H:%.*]]) { +; CHECK-NEXT: [[H_I0:%.*]] = extractelement <2 x half> [[H]], i64 0 +; CHECK-NEXT: [[R_I0:%.*]] = call { half, i32 } @llvm.frexp.f16.i32(half [[H_I0]]) +; CHECK-NEXT: [[H_I1:%.*]] = extractelement <2 x half> [[H]], i64 1 +; CHECK-NEXT: [[R_I1:%.*]] = call { half, i32 } @llvm.frexp.f16.i32(half [[H_I1]]) +; CHECK-NEXT: [[E0_ELEM0:%.*]] = extractvalue { half, i32 } [[R_I0]], 0 +; CHECK-NEXT: [[E0_ELEM01:%.*]] = extractvalue { half, i32 } [[R_I1]], 0 +; CHECK-NEXT: [[E0_UPTO0:%.*]] = insertelement <2 x half> poison, half [[E0_ELEM0]], i64 0 +; CHECK-NEXT: [[E0:%.*]] = insertelement <2 x half> [[E0_UPTO0]], half [[E0_ELEM01]], i64 1 +; CHECK-NEXT: ret <2 x half> [[E0]] +; + %r = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x half> %h) + %e0 = extractvalue { <2 x half>, <2 x i32> } %r, 0 + ret <2 x half> %e0 +} + +define noundef <2 x i32> @test_vector_half_frexp_int(<2 x half> noundef %h) { +; CHECK-LABEL: define noundef <2 x i32> @test_vector_half_frexp_int( +; CHECK-SAME: <2 x half> noundef [[H:%.*]]) { +; CHECK-NEXT: [[H_I0:%.*]] = extractelement <2 x half> [[H]], i64 0 +; CHECK-NEXT: [[R_I0:%.*]] = call { half, i32 } 
@llvm.frexp.f16.i32(half [[H_I0]]) +; CHECK-NEXT: [[H_I1:%.*]] = extractelement <2 x half> [[H]], i64 1 +; CHECK-NEXT: [[R_I1:%.*]] = call { half, i32 } @llvm.frexp.f16.i32(half [[H_I1]]) +; CHECK-NEXT: [[E1_ELEM1:%.*]] = extractvalue { half, i32 } [[R_I0]], 1 +; CHECK-NEXT: [[E1_ELEM11:%.*]] = extractvalue { half, i32 } [[R_I1]], 1 +; CHECK-NEXT: [[E1_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[E1_ELEM1]], i64 0 +; CHECK-NEXT: [[E1:%.*]] = insertelement <2 x i32> [[E1_UPTO0]], i32 [[E1_ELEM11]], i64 1 +; CHECK-NEXT: ret <2 x i32> [[E1]] +; + %r = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x half> %h) + %e1 = extractvalue { <2 x half>, <2 x i32> } %r, 1 + ret <2 x i32> %e1 +} + +define noundef <2 x float> @test_vector_float_frexp_int(<2 x float> noundef %f) { +; CHECK-LABEL: define noundef <2 x float> @test_vector_float_frexp_int( +; CHECK-SAME: <2 x float> noundef [[F:%.*]]) { +; CHECK-NEXT: [[F_I0:%.*]] = extractelement <2 x float> [[F]], i64 0 +; CHECK-NEXT: [[DOTI0:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[F_I0]]) +; CHECK-NEXT: [[F_I1:%.*]] = extractelement <2 x float> [[F]], i64 1 +; CHECK-NEXT: [[DOTI1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[F_I1]]) +; CHECK-NEXT: [[DOTELEM0:%.*]] = extractvalue { float, i32 } [[DOTI0]], 0 +; CHECK-NEXT: [[DOTELEM01:%.*]] = extractvalue { float, i32 } [[DOTI1]], 0 +; CHECK-NEXT: [[DOTUPTO010:%.*]] = insertelement <2 x float> poison, float [[DOTELEM0]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[DOTUPTO010]], float [[DOTELEM01]], i64 1 +; CHECK-NEXT: [[DOTELEM1:%.*]] = extractvalue { float, i32 } [[DOTI0]], 1 +; CHECK-NEXT: [[DOTELEM12:%.*]] = extractvalue { float, i32 } [[DOTI1]], 1 +; CHECK-NEXT: ret <2 x float> [[TMP1]] +; + %1 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x float> %f) + %2 = extractvalue { <2 x float>, <2 x i32> } %1, 0 + %3 = extractvalue { <2 x float>, <2 x i32> } %1, 1 + ret <2 x float> %2 +} + +define noundef <2 x double> @test_vector_double_frexp_int(<2 x double> noundef %d) { +; CHECK-LABEL: define noundef <2 x double> @test_vector_double_frexp_int( +; CHECK-SAME: <2 x double> noundef [[D:%.*]]) { +; CHECK-NEXT: [[D_I0:%.*]] = extractelement <2 x double> [[D]], i64 0 +; CHECK-NEXT: [[DOTI0:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[D_I0]]) +; CHECK-NEXT: [[D_I1:%.*]] = extractelement <2 x double> [[D]], i64 1 +; CHECK-NEXT: [[DOTI1:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[D_I1]]) +; CHECK-NEXT: [[DOTELEM0:%.*]] = extractvalue { double, i32 } [[DOTI0]], 0 +; CHECK-NEXT: [[DOTELEM01:%.*]] = extractvalue { double, i32 } [[DOTI1]], 0 +; CHECK-NEXT: [[DOTUPTO010:%.*]] = insertelement <2 x double> poison, double [[DOTELEM0]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[DOTUPTO010]], double [[DOTELEM01]], i64 1 +; CHECK-NEXT: [[DOTELEM1:%.*]] = extractvalue { double, i32 } [[DOTI0]], 1 +; CHECK-NEXT: [[DOTELEM12:%.*]] = extractvalue { double, i32 } [[DOTI1]], 1 +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %d) + %2 = extractvalue { <2 x double>, <2 x i32> } %1, 0 + %3 = extractvalue { <2 x double>, <2 x i32> } %1, 1 + ret <2 x double> %2 +} -- GitLab From 7eb8238a32516008476b717bc6a2be8c59f7f535 Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Mon, 21 Oct 2024 09:58:59 -0700 Subject: [PATCH 261/511] [TableGen] Handle Windows line endings in x86-fold-tables.td test (#112997) The x86-fold-tables.td has been failing 
for me and [in
CI](https://buildkite.com/llvm-project/github-pull-requests/builds/111277#0192a122-c5c9-4e4e-bc5b-7532fec99ae4)
if Git happens to decide to check out the baseline file with Windows
line endings.

The fix for this is to add the `--strip-trailing-cr` option to diff to
normalize the line endings before comparing them.
---
 llvm/test/TableGen/x86-fold-tables.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/TableGen/x86-fold-tables.td b/llvm/test/TableGen/x86-fold-tables.td
index d6e59bea0000..b4edcf6bbfd1 100644
--- a/llvm/test/TableGen/x86-fold-tables.td
+++ b/llvm/test/TableGen/x86-fold-tables.td
@@ -4,4 +4,4 @@
 // 2. cp x86-fold-tables.inc
 // RUN: llvm-tblgen -gen-x86-fold-tables -asmwriternum=1 %p/../../lib/Target/X86/X86.td -I %p/../../lib/Target/X86 -I %p/../../include -o %t
-// RUN: diff %p/x86-fold-tables.inc %t
+// RUN: diff --strip-trailing-cr %p/x86-fold-tables.inc %t
-- 
GitLab


From 8ae39c8e34de2d24c46827b324c76bac845c18b0 Mon Sep 17 00:00:00 2001
From: Daniel Paoliello
Date: Mon, 21 Oct 2024 10:02:18 -0700
Subject: [PATCH 262/511] [MC] Fix llvm-mc unterminated string constants
 warning for Windows (#112995)

#98060 introduced a warning for unterminated string constants; however,
it was only checking for `\n`, which means that it produced strange
results on Windows (always blaming column 1), including having the
[associated test
fail](https://buildkite.com/llvm-project/github-pull-requests/builds/111277#0192a122-c5c9-4e4e-bc5b-7532fec99ae4)
if Git happened to use Windows newlines when creating the file.

The fix for this is to detect both `\r` and `\n`, but not double-warn
for Windows newlines.
---
 llvm/lib/MC/MCParser/AsmParser.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 3f55d8a66bc2..4774e5112af5 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -3037,7 +3037,11 @@ bool AsmParser::parseEscapedString(std::string &Data) {
   StringRef Str = getTok().getStringContents();
   for (unsigned i = 0, e = Str.size(); i != e; ++i) {
     if (Str[i] != '\\') {
-      if (Str[i] == '\n') {
+      if ((Str[i] == '\n') || (Str[i] == '\r')) {
+        // Don't double-warn for Windows newlines.
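+        // E.g. for "a\r\nb" the '\r' triggers a single warning and the
+        // matching '\n' is skipped by the check below.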
+ if ((Str[i] == '\n') && (i > 0) && (Str[i - 1] == '\r')) + continue; + SMLoc NewlineLoc = SMLoc::getFromPointer(Str.data() + i); if (Warning(NewlineLoc, "unterminated string; newline inserted")) return true; -- GitLab From 766bd6f4d05a4b52892be4f1b740e67053a22ee6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 21 Oct 2024 10:35:53 -0700 Subject: [PATCH 263/511] [AMDGPU] Avoid repeated map lookups (NFC) (#112819) --- llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 7c7e0204b176..77b4f25021c7 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -878,14 +878,11 @@ void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum]; SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum]; - std::map, unsigned>::iterator Pos = - ColorCombinations.find(SUColors); - if (Pos != ColorCombinations.end()) { - CurrentColoring[SU.NodeNum] = Pos->second; - } else { - CurrentColoring[SU.NodeNum] = NextNonReservedID; - ColorCombinations[SUColors] = NextNonReservedID++; - } + auto [Pos, Inserted] = + ColorCombinations.try_emplace(SUColors, NextNonReservedID); + CurrentColoring[SU.NodeNum] = Pos->second; + if (Inserted) + NextNonReservedID++; } } -- GitLab From eaa7b385368fa7e3dad9b95411d04be55e71494e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 21 Oct 2024 10:36:19 -0700 Subject: [PATCH 264/511] [Transforms] Avoid repeated hash lookups (NFC) (#113120) --- llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index e323b391179e..eaf58ea8dd9d 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1462,9 +1462,12 @@ public: if (!CanBeFlattened(Op)) return; - if (match(Op, m_BinOp()) && ShapeMap.find(Op) != ShapeMap.end()) { - ShapeMap[Op] = ShapeMap[Op].t(); - return; + if (match(Op, m_BinOp())) { + auto It = ShapeMap.find(Op); + if (It != ShapeMap.end()) { + It->second = It->second.t(); + return; + } } FusedInsts.insert(cast(Op)); -- GitLab From 30a402833f50b14148c8b963f3ffaaeaeea5fd78 Mon Sep 17 00:00:00 2001 From: Augusto Noronha Date: Mon, 21 Oct 2024 10:35:20 -0700 Subject: [PATCH 265/511] [lldb][NFC] Fix doxygen comment on top of GetMangledTypeName --- lldb/include/lldb/Symbol/TypeSystem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 416445a60bd0..b06bfa583bad 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -237,8 +237,8 @@ public: virtual ConstString GetDisplayTypeName(lldb::opaque_compiler_type_t type) = 0; - // Defaults to GetTypeName(type). Override if your language desires - // specialized behavior. + /// Defaults to GetTypeName(type). Override if your language desires + /// specialized behavior. 
virtual ConstString GetMangledTypeName(lldb::opaque_compiler_type_t type); virtual uint32_t -- GitLab From ab07fc832009b678c0b24392ad7e02a8e5dd3932 Mon Sep 17 00:00:00 2001 From: Yijia Gu Date: Mon, 21 Oct 2024 10:41:12 -0700 Subject: [PATCH 266/511] [mlir][bazel] add missing dep in OpenAccTransforms --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 779609340d72..0c7ed4201735 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -10452,6 +10452,7 @@ cc_library( ":OpenACCPassIncGen", ":Pass", ":TransformUtils", + "//llvm:Support", ], ) -- GitLab From 2c331b35712e0fad93cf804674196b7c0e47ebd9 Mon Sep 17 00:00:00 2001 From: Yijia Gu Date: Mon, 21 Oct 2024 10:43:55 -0700 Subject: [PATCH 267/511] [mlir][bazel] remove tab blank in OpenAccTransforms --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 0c7ed4201735..00254ba6e99b 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -10452,7 +10452,7 @@ cc_library( ":OpenACCPassIncGen", ":Pass", ":TransformUtils", - "//llvm:Support", + "//llvm:Support", ], ) -- GitLab From 4de708e32e31ac32b924dfeb020086636700c0f7 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 21 Oct 2024 10:45:55 -0700 Subject: [PATCH 268/511] [lldb] Remove stack_logging.h (#112987) This file is covered under the Apple open source license rather than the LLVM license. Presumably this was an oversight, but it doesn't really matter as this file is unused. Remove it altogether. 
--- .../debugserver.xcodeproj/project.pbxproj | 1 - .../debugserver/source/MacOSX/stack_logging.h | 158 ------------------ 2 files changed, 159 deletions(-) delete mode 100644 lldb/tools/debugserver/source/MacOSX/stack_logging.h diff --git a/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj b/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj index 79175c6d9dd7..c25eabcbadab 100644 --- a/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj +++ b/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj @@ -163,7 +163,6 @@ 456F67721AD46CE9002850C2 /* debugserver-nonui */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "debugserver-nonui"; sourceTree = BUILT_PRODUCTS_DIR; }; 49D404611E39260F00570CDC /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 49F5301213316D7F008956F6 /* MachRegisterStatesX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesX86_64.h; sourceTree = ""; }; - 9457ECF61419864100DFE7D8 /* stack_logging.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = stack_logging.h; sourceTree = ""; }; 9684D93A29FCA1E10046D45E /* debugserver-macosx-private-entitlements.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist; name = "debugserver-macosx-private-entitlements.plist"; path = "resources/debugserver-macosx-private-entitlements.plist"; sourceTree = SOURCE_ROOT; }; 9684D93B29FCA1E10046D45E /* lldb-debugserver-Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist; name = "lldb-debugserver-Info.plist"; path = "resources/lldb-debugserver-Info.plist"; sourceTree = SOURCE_ROOT; }; 9684D93C29FCA1E10046D45E /* debugserver-macosx-entitlements.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist; name = "debugserver-macosx-entitlements.plist"; path = "resources/debugserver-macosx-entitlements.plist"; sourceTree = SOURCE_ROOT; }; diff --git a/lldb/tools/debugserver/source/MacOSX/stack_logging.h b/lldb/tools/debugserver/source/MacOSX/stack_logging.h deleted file mode 100644 index 5209e38a08ea..000000000000 --- a/lldb/tools/debugserver/source/MacOSX/stack_logging.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 1999-2007 Apple Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_LICENSE_HEADER_END@ - */ - -#ifndef malloc_history_test_stack_logging_h -#define malloc_history_test_stack_logging_h - -#import - -#define stack_logging_type_free 0 -#define stack_logging_type_generic \ - 1 /* anything that is not allocation/deallocation */ -#define stack_logging_type_alloc 2 /* malloc, realloc, etc... */ -#define stack_logging_type_dealloc 4 /* free, realloc, etc... */ - -// Following flags are absorbed by stack_logging_log_stack() -#define stack_logging_flag_zone 8 /* NSZoneMalloc, etc... */ -#define stack_logging_flag_calloc 16 /* multiply arguments to get the size */ -#define stack_logging_flag_object \ - 32 /* NSAllocateObject(Class, extraBytes, zone) */ -#define stack_logging_flag_cleared 64 /* for NewEmptyHandle */ -#define stack_logging_flag_handle 128 /* for Handle (de-)allocation routines \ - */ -#define stack_logging_flag_set_handle_size \ - 256 /* (Handle, newSize) treated specially */ - -/* Macro used to disguise addresses so that leak finding can work */ -#define STACK_LOGGING_DISGUISE(address) \ - ((address) ^ 0x00005555) /* nicely idempotent */ - -extern "C" int - stack_logging_enable_logging; /* when clear, no logging takes place */ -extern "C" int stack_logging_dontcompact; /* default is to compact; when set - does not compact alloc/free logs; - useful for tracing history */ - -extern "C" void stack_logging_log_stack(unsigned type, unsigned arg1, - unsigned arg2, unsigned arg3, - unsigned result, - unsigned num_hot_to_skip); -/* This is the old log-to-memory logger, which is now deprecated. It remains - * for compatibility with performance tools that haven't been updated to - * disk_stack_logging_log_stack() yet. */ - -extern "C" void -__disk_stack_logging_log_stack(uint32_t type_flags, uintptr_t zone_ptr, - uintptr_t size, uintptr_t ptr_arg, - uintptr_t return_val, uint32_t num_hot_to_skip); -/* Fits as the malloc_logger; logs malloc/free/realloc events and can log custom - * events if called directly */ - -/* 64-bit-aware stack log access. */ -typedef struct { - uint32_t type_flags; - uint64_t stack_identifier; - uint64_t argument; - mach_vm_address_t address; -} mach_stack_logging_record_t; - -extern "C" kern_return_t -__mach_stack_logging_get_frames(task_t task, mach_vm_address_t address, - mach_vm_address_t *stack_frames_buffer, - uint32_t max_stack_frames, uint32_t *count); -/* Gets the last allocation record (malloc, realloc, or free) about address */ - -extern "C" kern_return_t __mach_stack_logging_enumerate_records( - task_t task, mach_vm_address_t address, - void enumerator(mach_stack_logging_record_t, void *), void *context); -/* Applies enumerator to all records involving address sending context as - * enumerator's second parameter; if !address, applies enumerator to all records - */ - -extern "C" kern_return_t __mach_stack_logging_frames_for_uniqued_stack( - task_t task, uint64_t stack_identifier, - mach_vm_address_t *stack_frames_buffer, uint32_t max_stack_frames, - uint32_t *count); -/* Given a uniqued_stack fills stack_frames_buffer */ - -#pragma mark - -#pragma mark Legacy - -/* The following is the old 32-bit-only, in-process-memory stack logging. This - * is deprecated and clients should move to the above 64-bit-aware disk stack - * logging SPI. 
*/ - -typedef struct { - unsigned type; - unsigned uniqued_stack; - unsigned argument; - unsigned address; /* disguised, to avoid confusing leaks */ -} stack_logging_record_t; - -typedef struct { - unsigned overall_num_bytes; - unsigned num_records; - unsigned lock; /* 0 means OK to lock; used for inter-process locking */ - unsigned *uniquing_table; /* allocated using vm_allocate() */ - /* hashtable organized as (PC, uniqued parent) - Only the second half of the table is active - To enable us to grow dynamically */ - unsigned uniquing_table_num_pages; /* number of pages of the table */ - unsigned extra_retain_count; /* not used by stack_logging_log_stack */ - unsigned filler[2]; /* align to cache lines for better performance */ - stack_logging_record_t records[0]; /* records follow here */ -} stack_logging_record_list_t; - -extern "C" stack_logging_record_list_t *stack_logging_the_record_list; -/* This is the global variable containing all logs */ - -extern "C" kern_return_t -stack_logging_get_frames(task_t task, memory_reader_t reader, - vm_address_t address, - vm_address_t *stack_frames_buffer, - unsigned max_stack_frames, unsigned *num_frames); -/* Gets the last record in stack_logging_the_record_list about address */ - -#define STACK_LOGGING_ENUMERATION_PROVIDED \ - 1 // temporary to avoid dependencies between projects - -extern "C" kern_return_t stack_logging_enumerate_records( - task_t task, memory_reader_t reader, vm_address_t address, - void enumerator(stack_logging_record_t, void *), void *context); -/* Gets all the records about address; - If !address, gets all records */ - -extern "C" kern_return_t stack_logging_frames_for_uniqued_stack( - task_t task, memory_reader_t reader, unsigned uniqued_stack, - vm_address_t *stack_frames_buffer, unsigned max_stack_frames, - unsigned *num_frames); -/* Given a uniqued_stack fills stack_frames_buffer */ - -extern "C" void thread_stack_pcs(vm_address_t *buffer, unsigned max, - unsigned *num); -/* Convenience to fill buffer with the PCs of the frames, starting with the hot - frames; - num: returned number of frames - */ - -#endif -- GitLab From ed5072ee28809abf0f140ca15df549a418bb5c69 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Mon, 21 Oct 2024 10:46:21 -0700 Subject: [PATCH 269/511] [NFC][lld-macho] Generate test bodies for icf-safe-thunk tests (#111927) Autogenerate `.ll` code from cpp code in some `-icf-safe-thunk` tests using `update_test_body.py` ``` PATH=build/bin:$PATH llvm/utils/update_test_body.py lld/test/MachO/icf-safe-thunks.ll lld/test/MachO/icf-safe-thunks-dwarf.ll ``` https://llvm.org/docs/TestingGuide.html#elaborated-tests I recently became aware of this tool and I wanted to practice using it. This also allows to remove the custom instructions to generate the `.ll` code. 
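
For reference, the generated tests follow the elaborated-test layout (a
minimal sketch of the dwarf test's sections; the real RUN lines and CHECKs
are kept in the tests below):

```
;--- a.cpp   <- C++ source kept in the test file
;--- gen     <- command update_test_body.py reruns to refresh the body
clang -target arm64-apple-macos11.0 -S -emit-llvm a.cpp -O3 -g -o -
;--- a.ll    <- autogenerated section, overwritten by the tool
```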
--- lld/test/MachO/icf-safe-thunks-dwarf.ll | 147 ++++++------- lld/test/MachO/icf-safe-thunks.ll | 261 ++++++++++-------------- 2 files changed, 177 insertions(+), 231 deletions(-) diff --git a/lld/test/MachO/icf-safe-thunks-dwarf.ll b/lld/test/MachO/icf-safe-thunks-dwarf.ll index 74f3fb7a033d..1e4422a33132 100644 --- a/lld/test/MachO/icf-safe-thunks-dwarf.ll +++ b/lld/test/MachO/icf-safe-thunks-dwarf.ll @@ -1,17 +1,18 @@ +; NOTE: Code has been autogenerated by utils/update_test_body.py ; REQUIRES: aarch64 -;;; Build the -; RUN: rm -rf %t; mkdir %t -; RUN: llc -filetype=obj %s -O3 -o %t/icf-obj-safe-thunks-dwarf.o -enable-machine-outliner=never -mtriple arm64-apple-macos -addrsig -; RUN: %lld -arch arm64 -lSystem --icf=safe_thunks -dylib -o %t/icf-safe-dwarf.dylib %t/icf-obj-safe-thunks-dwarf.o +; RUN: rm -rf %t && split-file %s %t + +; RUN: llc -filetype=obj %t/a.ll -O3 -o %t/a.o -enable-machine-outliner=never -mtriple arm64-apple-macos -addrsig +; RUN: %lld -arch arm64 -lSystem --icf=safe_thunks -dylib -o %t/a.dylib %t/a.o ;;; Check that we generate valid dSYM -; RUN: dsymutil %t/icf-safe-dwarf.dylib -o %t/icf-safe.dSYM -; RUN: llvm-dwarfdump --verify %t/icf-safe.dSYM | FileCheck %s --check-prefix=VERIFY-DSYM +; RUN: dsymutil %t/a.dylib -o %t/a.dSYM +; RUN: llvm-dwarfdump --verify %t/a.dSYM | FileCheck %s --check-prefix=VERIFY-DSYM ; VERIFY-DSYM: No errors. ;;; Check that we don't generate STABS entries (N_FUN) for ICF'ed function thunks -; RUN: dsymutil -s %t/icf-safe-dwarf.dylib | FileCheck %s --check-prefix=VERIFY-STABS +; RUN: dsymutil -s %t/a.dylib | FileCheck %s --check-prefix=VERIFY-STABS ; VERIFY-STABS-NOT: N_FUN{{.*}}_func_B ; VERIFY-STABS-NOT: N_FUN{{.*}}_func_C @@ -19,97 +20,85 @@ ; VERIFY-STABS: N_FUN{{.*}}_func_A ; VERIFY-STABS: N_FUN{{.*}}_take_func_addr +;--- a.cpp +#define ATTR __attribute__((noinline)) extern "C" +typedef unsigned long long ULL; + +ATTR int func_A() { return 1; } +ATTR int func_B() { return 1; } +ATTR int func_C() { return 1; } + +ATTR ULL take_func_addr() { + ULL val = 0; + val += (ULL)(void*)func_A; + val += (ULL)(void*)func_B; + val += (ULL)(void*)func_C; + return val; +} + +;--- gen +clang -target arm64-apple-macos11.0 -S -emit-llvm a.cpp -O3 -g -o - -; ModuleID = 'icf-safe-thunks-dwarf.cpp' -source_filename = "icf-safe-thunks-dwarf.cpp" +;--- a.ll +; ModuleID = 'a.cpp' +source_filename = "a.cpp" target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" target triple = "arm64-apple-macosx11.0.0" -; Function Attrs: mustprogress noinline nounwind optnone ssp uwtable(sync) -define i32 @func_A() #0 !dbg !13 { -entry: - ret i32 1 +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) +define noundef i32 @func_A() #0 !dbg !12 { + ret i32 1, !dbg !16 } -; Function Attrs: mustprogress noinline nounwind optnone ssp uwtable(sync) -define i32 @func_B() #0 !dbg !18 { -entry: - ret i32 1 +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) +define noundef i32 @func_B() #0 !dbg !17 { + ret i32 1, !dbg !18 } -; Function Attrs: mustprogress noinline nounwind optnone ssp uwtable(sync) -define i32 @func_C() #0 !dbg !20 { -entry: - ret i32 1 +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) +define noundef i32 @func_C() #0 !dbg !19 { + ret i32 1, !dbg !20 } -; Function Attrs: mustprogress noinline nounwind optnone ssp uwtable(sync) -define i64 @take_func_addr() #0 !dbg !22 { 
-entry: - %val = alloca i64, align 8 - store i64 0, ptr %val, align 8 - %0 = load i64, ptr %val, align 8 - %add = add i64 %0, ptrtoint (ptr @func_A to i64) - store i64 %add, ptr %val, align 8 - %1 = load i64, ptr %val, align 8 - %add1 = add i64 %1, ptrtoint (ptr @func_B to i64) - store i64 %add1, ptr %val, align 8 - %2 = load i64, ptr %val, align 8 - %add2 = add i64 %2, ptrtoint (ptr @func_C to i64) - store i64 %add2, ptr %val, align 8 - %3 = load i64, ptr %val, align 8 - ret i64 %3 +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) +define noundef i64 @take_func_addr() local_unnamed_addr #0 !dbg !21 { + #dbg_value(i64 0, !25, !DIExpression(), !26) + #dbg_value(i64 ptrtoint (ptr @func_A to i64), !25, !DIExpression(), !26) + #dbg_value(i64 add (i64 ptrtoint (ptr @func_A to i64), i64 ptrtoint (ptr @func_B to i64)), !25, !DIExpression(), !26) + #dbg_value(i64 add (i64 add (i64 ptrtoint (ptr @func_A to i64), i64 ptrtoint (ptr @func_B to i64)), i64 ptrtoint (ptr @func_C to i64)), !25, !DIExpression(), !26) + ret i64 add (i64 add (i64 ptrtoint (ptr @func_A to i64), i64 ptrtoint (ptr @func_B to i64)), i64 ptrtoint (ptr @func_C to i64)), !dbg !27 } -attributes #0 = { noinline nounwind } +attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,+zcm,+zcz" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!6, !7, !8, !9, !10, !11} -!llvm.ident = !{!12} -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") -!1 = !DIFile(filename: "icf-safe-thunks-dwarf.cpp", directory: "/tmp/test") +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") +!1 = !DIFile(filename: "a.cpp", directory: "/proc/self/cwd") +!2 = !{!3, !5} +!3 = !DIDerivedType(tag: DW_TAG_typedef, name: "ULL", file: !1, line: 2, baseType: !4) +!4 = !DIBasicType(name: "unsigned long long", size: 64, encoding: DW_ATE_unsigned) +!5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) !6 = !{i32 7, !"Dwarf Version", i32 4} !7 = !{i32 2, !"Debug Info Version", i32 3} !8 = !{i32 1, !"wchar_size", i32 4} !9 = !{i32 8, !"PIC Level", i32 2} !10 = !{i32 7, !"uwtable", i32 1} !11 = !{i32 7, !"frame-pointer", i32 1} -!12 = !{!"clang version 20.0.0"} -!13 = distinct !DISubprogram(name: "func_A", scope: !1, file: !1, line: 4, type: !14, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0) -!14 = !DISubroutineType(types: !15) -!15 = !{} -!18 = distinct !DISubprogram(name: "func_B", scope: !1, file: !1, line: 5, type: !14, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0) -!20 = distinct !DISubprogram(name: "func_C", scope: !1, file: !1, line: 6, type: !14, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0) -!22 = distinct !DISubprogram(name: 
"take_func_addr", scope: !1, file: !1, line: 8, type: !14, scopeLine: 8, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0) - - - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;;;;;;;;;;;; Generate the above LLVM IR with the below script ;;;;;;;;;;;;;;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; #!/bin/bash -; set -ex -; TOOLCHAIN_BIN="llvm-project/build/Debug/bin" -; -; # Create icf-safe-thunks-dwarf.cpp file -; cat > icf-safe-thunks-dwarf.cpp < icf-safe-thunks.cpp < Date: Mon, 21 Oct 2024 10:52:15 -0700 Subject: [PATCH 270/511] [nfc][msan] Reorder flags in RUN: (#113196) --- .../test/Instrumentation/MemorySanitizer/msan_basic.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll index fe7637918524..263ffe553d08 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll @@ -1,8 +1,8 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CHECK,NOORIGINS --implicit-check-not="call void @__msan_warning" -; RUN: opt < %s --passes='module(msan)' -msan-check-access-address=0 -S | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CHECK,NOORIGINS --implicit-check-not="call void @__msan_warning" -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CHECK,ORIGINS %s --implicit-check-not="call void @__msan_warning" -; RUN: opt < %s -passes='module(msan)' -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CHECK,ORIGINS %s --implicit-check-not="call void @__msan_warning" -; RUN: opt < %s -passes='module(msan)' -msan-instrumentation-with-call-threshold=0 -msan-track-origins=1 -S | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CHECK-CALLS %s --implicit-check-not="call void @__msan_warning" +; RUN: opt < %s -S -passes='module(msan)' -msan-check-access-address=0 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK,NOORIGINS +; RUN: opt < %s -S -passes='module(msan)' -msan-check-access-address=0 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK,NOORIGINS +; RUN: opt < %s -S -passes='module(msan)' -msan-check-access-address=0 -msan-track-origins=1 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK,ORIGINS +; RUN: opt < %s -S -passes='module(msan)' -msan-check-access-address=0 -msan-track-origins=1 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK,ORIGINS +; RUN: opt < %s -S -passes='module(msan)' -msan-instrumentation-with-call-threshold=0 -msan-track-origins=1 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK-CALLS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -- GitLab From cafeacff2c6367a229aa8b65be99835177f5c3be Mon Sep 17 00:00:00 2001 From: 
Vitaly Buka Date: Mon, 21 Oct 2024 10:53:33 -0700 Subject: [PATCH 271/511] [nfc][msan] Remove RUN: duplicates (#113197) --- llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll index 263ffe553d08..74517eea4c11 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll @@ -1,6 +1,4 @@ ; RUN: opt < %s -S -passes='module(msan)' -msan-check-access-address=0 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK,NOORIGINS -; RUN: opt < %s -S -passes='module(msan)' -msan-check-access-address=0 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK,NOORIGINS -; RUN: opt < %s -S -passes='module(msan)' -msan-check-access-address=0 -msan-track-origins=1 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK,ORIGINS ; RUN: opt < %s -S -passes='module(msan)' -msan-check-access-address=0 -msan-track-origins=1 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK,ORIGINS ; RUN: opt < %s -S -passes='module(msan)' -msan-instrumentation-with-call-threshold=0 -msan-track-origins=1 | FileCheck %s --allow-deprecated-dag-overlap --implicit-check-not="call void @__msan_warning" --check-prefixes=CHECK-CALLS -- GitLab From 7dc2542ac24fcae89dfd179fa58c4ec4fb959e2b Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 21 Oct 2024 10:55:08 -0700 Subject: [PATCH 272/511] [nfc][msan] Fix old typo in test (#113198) --- llvm/test/Instrumentation/MemorySanitizer/pr32842.ll | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/pr32842.ll b/llvm/test/Instrumentation/MemorySanitizer/pr32842.ll index 381ab1b3a435..e64710877161 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/pr32842.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/pr32842.ll @@ -1,7 +1,8 @@ ; Regression test for https://bugs.llvm.org/show_bug.cgi?id=32842 ; ; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck %s -;target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define zeroext i1 @_Z1fii(i32 %x, i32 %y) sanitize_memory { -- GitLab From f58ce1152703ca753794b8cef36da30bd2668d0f Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 21 Oct 2024 11:02:38 -0700 Subject: [PATCH 273/511] [NFC][TableGen] Use auto when initializing variables with cast<> (#113171) Use `auto` when initializing a variable with `cast<>`. Remove some unneeded `const_cast` (since all Init pointers are now const). --- llvm/include/llvm/TableGen/Record.h | 12 +- llvm/lib/TableGen/Record.cpp | 328 ++++++++++++++-------------- llvm/lib/TableGen/SetTheory.cpp | 2 +- llvm/lib/TableGen/TGParser.cpp | 150 ++++++------- 4 files changed, 243 insertions(+), 249 deletions(-) diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index 63267b7633f6..78b44cfc649a 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -401,9 +401,7 @@ public: /// variables which may not be defined at the time the expression is formed. 
/// If a value is set for the variable later, this method will be called on /// users of the value to allow the value to propagate out. - virtual const Init *resolveReferences(Resolver &R) const { - return const_cast(this); - } + virtual const Init *resolveReferences(Resolver &R) const { return this; } /// Get the \p Init value of the specified bit. virtual const Init *getBit(unsigned Bit) const = 0; @@ -475,9 +473,7 @@ public: const Init *getCastTo(const RecTy *Ty) const override; const Init *convertInitializerTo(const RecTy *Ty) const override; - const Init *getBit(unsigned Bit) const override { - return const_cast(this); - } + const Init *getBit(unsigned Bit) const override { return this; } /// Is this a complete value with no unset (uninitialized) subvalues? bool isComplete() const override { return false; } @@ -579,7 +575,7 @@ public: const Init *getBit(unsigned Bit) const override { assert(Bit < 1 && "Bit index out of range!"); - return const_cast(this); + return this; } bool isConcrete() const override { return true; } @@ -1318,7 +1314,7 @@ public: const Init *getBit(unsigned B) const override { assert(B < 1 && "Bit index out of range!"); - return const_cast(this); + return this; } }; diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index f8ea88375c48..9241fb3d8e72 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -156,7 +156,7 @@ const BitRecTy *BitRecTy::get(RecordKeeper &RK) { bool BitRecTy::typeIsConvertibleTo(const RecTy *RHS) const{ if (RecTy::typeIsConvertibleTo(RHS) || RHS->getRecTyKind() == IntRecTyKind) return true; - if (const BitsRecTy *BitsTy = dyn_cast(RHS)) + if (const auto *BitsTy = dyn_cast(RHS)) return BitsTy->getNumBits() == 1; return false; } @@ -215,7 +215,7 @@ bool ListRecTy::typeIsConvertibleTo(const RecTy *RHS) const { } bool ListRecTy::typeIsA(const RecTy *RHS) const { - if (const ListRecTy *RHSl = dyn_cast(RHS)) + if (const auto *RHSl = dyn_cast(RHS)) return getElementType()->typeIsA(RHSl->getElementType()); return false; } @@ -309,7 +309,7 @@ bool RecordRecTy::typeIsConvertibleTo(const RecTy *RHS) const { if (this == RHS) return true; - const RecordRecTy *RTy = dyn_cast(RHS); + const auto *RTy = dyn_cast(RHS); if (!RTy) return false; @@ -344,8 +344,8 @@ const RecTy *llvm::resolveTypes(const RecTy *T1, const RecTy *T2) { if (T1 == T2) return T1; - if (const RecordRecTy *RecTy1 = dyn_cast(T1)) { - if (const RecordRecTy *RecTy2 = dyn_cast(T2)) + if (const auto *RecTy1 = dyn_cast(T1)) { + if (const auto *RecTy2 = dyn_cast(T2)) return resolveRecordTypes(RecTy1, RecTy2); } @@ -357,8 +357,8 @@ const RecTy *llvm::resolveTypes(const RecTy *T1, const RecTy *T2) { if (T2->typeIsConvertibleTo(T1)) return T1; - if (const ListRecTy *ListTy1 = dyn_cast(T1)) { - if (const ListRecTy *ListTy2 = dyn_cast(T2)) { + if (const auto *ListTy1 = dyn_cast(T1)) { + if (const auto *ListTy2 = dyn_cast(T2)) { const RecTy *NewType = resolveTypes(ListTy1->getElementType(), ListTy2->getElementType()); if (NewType) @@ -433,7 +433,7 @@ const Init *ArgumentInit::resolveReferences(Resolver &R) const { if (NewValue != Value) return cloneWithValue(NewValue); - return const_cast(this); + return this; } BitInit *BitInit::get(RecordKeeper &RK, bool V) { @@ -442,7 +442,7 @@ BitInit *BitInit::get(RecordKeeper &RK, bool V) { const Init *BitInit::convertInitializerTo(const RecTy *Ty) const { if (isa(Ty)) - return const_cast(this); + return this; if (isa(Ty)) return IntInit::get(getRecordKeeper(), getValue()); @@ -450,7 +450,7 @@ const Init 
*BitInit::convertInitializerTo(const RecTy *Ty) const { if (auto *BRT = dyn_cast(Ty)) { // Can only convert single bit. if (BRT->getNumBits() == 1) - return BitsInit::get(getRecordKeeper(), const_cast(this)); + return BitsInit::get(getRecordKeeper(), this); } return nullptr; @@ -496,7 +496,7 @@ const Init *BitsInit::convertInitializerTo(const RecTy *Ty) const { // If the number of bits is right, return it. Otherwise we need to expand // or truncate. if (getNumBits() != BRT->getNumBits()) return nullptr; - return const_cast(this); + return this; } if (isa(Ty)) { @@ -563,7 +563,7 @@ const Init *BitsInit::resolveReferences(Resolver &R) const { const Init *CurBit = getBit(i); const Init *NewBit = CurBit; - if (const VarBitInit *CurBitVar = dyn_cast(CurBit)) { + if (const auto *CurBitVar = dyn_cast(CurBit)) { if (CurBitVar->getBitVar() != CachedBitVarRef) { CachedBitVarRef = CurBitVar->getBitVar(); CachedBitVarResolved = CachedBitVarRef->resolveReferences(R); @@ -606,7 +606,7 @@ static bool canFitInBitfield(int64_t Value, unsigned NumBits) { const Init *IntInit::convertInitializerTo(const RecTy *Ty) const { if (isa(Ty)) - return const_cast(this); + return this; if (isa(Ty)) { int64_t Val = getValue(); @@ -614,7 +614,7 @@ const Init *IntInit::convertInitializerTo(const RecTy *Ty) const { return BitInit::get(getRecordKeeper(), Val != 0); } - if (auto *BRT = dyn_cast(Ty)) { + if (const auto *BRT = dyn_cast(Ty)) { int64_t Value = getValue(); // Make sure this bitfield is large enough to hold the integer value. if (!canFitInBitfield(Value, BRT->getNumBits())) @@ -657,11 +657,11 @@ std::string AnonymousNameInit::getAsString() const { } const Init *AnonymousNameInit::resolveReferences(Resolver &R) const { - auto *Old = const_cast(static_cast(this)); + auto *Old = this; auto *New = R.resolve(Old); New = New ? 
New : Old; if (R.isFinal()) - if (auto *Anonymous = dyn_cast(New)) + if (const auto *Anonymous = dyn_cast(New)) return Anonymous->getNameInit(); return New; } @@ -679,7 +679,7 @@ const StringInit *StringInit::get(RecordKeeper &RK, StringRef V, const Init *StringInit::convertInitializerTo(const RecTy *Ty) const { if (isa(Ty)) - return const_cast(this); + return this; return nullptr; } @@ -723,9 +723,9 @@ void ListInit::Profile(FoldingSetNodeID &ID) const { const Init *ListInit::convertInitializerTo(const RecTy *Ty) const { if (getType() == Ty) - return const_cast(this); + return this; - if (auto *LRT = dyn_cast(Ty)) { + if (const auto *LRT = dyn_cast(Ty)) { SmallVector Elements; Elements.reserve(getValues().size()); @@ -742,7 +742,7 @@ const Init *ListInit::convertInitializerTo(const RecTy *Ty) const { return nullptr; if (!Changed) - return const_cast(this); + return this; return ListInit::get(Elements, ElementType); } @@ -751,7 +751,7 @@ const Init *ListInit::convertInitializerTo(const RecTy *Ty) const { const Record *ListInit::getElementAsRecord(unsigned i) const { assert(i < NumValues && "List element index out of range!"); - const DefInit *DI = dyn_cast(getElement(i)); + const auto *DI = dyn_cast(getElement(i)); if (!DI) PrintFatalError("Expected record in list!"); return DI->getDef(); @@ -802,7 +802,7 @@ std::string ListInit::getAsString() const { const Init *OpInit::getBit(unsigned Bit) const { if (getType() == BitRecTy::get(getRecordKeeper())) - return const_cast(this); + return this; return VarBitInit::get(this, Bit); } @@ -853,27 +853,27 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { } break; case TOLOWER: - if (const StringInit *LHSs = dyn_cast(LHS)) + if (const auto *LHSs = dyn_cast(LHS)) return StringInit::get(RK, LHSs->getValue().lower()); break; case TOUPPER: - if (const StringInit *LHSs = dyn_cast(LHS)) + if (const auto *LHSs = dyn_cast(LHS)) return StringInit::get(RK, LHSs->getValue().upper()); break; case CAST: if (isa(getType())) { - if (const StringInit *LHSs = dyn_cast(LHS)) + if (const auto *LHSs = dyn_cast(LHS)) return LHSs; - if (const DefInit *LHSd = dyn_cast(LHS)) + if (const auto *LHSd = dyn_cast(LHS)) return StringInit::get(RK, LHSd->getAsString()); - if (const IntInit *LHSi = dyn_cast_or_null( + if (const auto *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) return StringInit::get(RK, LHSi->getAsString()); } else if (isa(getType())) { - if (const StringInit *Name = dyn_cast(LHS)) { + if (const auto *Name = dyn_cast(LHS)) { const Record *D = RK.getDef(Name->getValue()); if (!D && CurRec) { // Self-references are allowed, but their resolution is delayed until @@ -918,20 +918,20 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { break; case NOT: - if (const IntInit *LHSi = dyn_cast_or_null( + if (const auto *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) return IntInit::get(RK, LHSi->getValue() ? 0 : 1); break; case HEAD: - if (const ListInit *LHSl = dyn_cast(LHS)) { + if (const auto *LHSl = dyn_cast(LHS)) { assert(!LHSl->empty() && "Empty list in head"); return LHSl->getElement(0); } break; case TAIL: - if (const ListInit *LHSl = dyn_cast(LHS)) { + if (const auto *LHSl = dyn_cast(LHS)) { assert(!LHSl->empty() && "Empty list in tail"); // Note the +1. We can't just pass the result of getValues() // directly. 
@@ -940,25 +940,25 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { break; case SIZE: - if (const ListInit *LHSl = dyn_cast(LHS)) + if (const auto *LHSl = dyn_cast(LHS)) return IntInit::get(RK, LHSl->size()); - if (const DagInit *LHSd = dyn_cast(LHS)) + if (const auto *LHSd = dyn_cast(LHS)) return IntInit::get(RK, LHSd->arg_size()); - if (const StringInit *LHSs = dyn_cast(LHS)) + if (const auto *LHSs = dyn_cast(LHS)) return IntInit::get(RK, LHSs->getValue().size()); break; case EMPTY: - if (const ListInit *LHSl = dyn_cast(LHS)) + if (const auto *LHSl = dyn_cast(LHS)) return IntInit::get(RK, LHSl->empty()); - if (const DagInit *LHSd = dyn_cast(LHS)) + if (const auto *LHSd = dyn_cast(LHS)) return IntInit::get(RK, LHSd->arg_empty()); - if (const StringInit *LHSs = dyn_cast(LHS)) + if (const auto *LHSs = dyn_cast(LHS)) return IntInit::get(RK, LHSs->getValue().empty()); break; case GETDAGOP: - if (const DagInit *Dag = dyn_cast(LHS)) { + if (const auto *Dag = dyn_cast(LHS)) { // TI is not necessarily a def due to the late resolution in multiclasses, // but has to be a TypedInit. auto *TI = cast(Dag->getOperator()); @@ -974,7 +974,7 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { break; case LOG2: - if (const IntInit *LHSi = dyn_cast_or_null( + if (const auto *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) { int64_t LHSv = LHSi->getValue(); if (LHSv <= 0) { @@ -991,9 +991,8 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { break; case LISTFLATTEN: - if (const ListInit *LHSList = dyn_cast(LHS)) { - const ListRecTy *InnerListTy = - dyn_cast(LHSList->getElementType()); + if (const auto *LHSList = dyn_cast(LHS)) { + const auto *InnerListTy = dyn_cast(LHSList->getElementType()); // list of non-lists, !listflatten() is a NOP. if (!InnerListTy) return LHS; @@ -1003,7 +1002,7 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { std::vector Flattened; // Concatenate elements of all the inner lists. 
for (const Init *InnerInit : List->getValues()) { - const ListInit *InnerList = dyn_cast(InnerInit); + const auto *InnerList = dyn_cast(InnerInit); if (!InnerList) return std::nullopt; for (const Init *InnerElem : InnerList->getValues()) @@ -1018,7 +1017,7 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { } break; } - return const_cast(this); + return this; } const Init *UnOpInit::resolveReferences(Resolver &R) const { @@ -1098,7 +1097,7 @@ static const StringInit *interleaveStringList(const ListInit *List, const StringInit *Delim) { if (List->size() == 0) return StringInit::get(List->getRecordKeeper(), ""); - const StringInit *Element = dyn_cast(List->getElement(0)); + const auto *Element = dyn_cast(List->getElement(0)); if (!Element) return nullptr; SmallString<80> Result(Element->getValue()); @@ -1106,7 +1105,7 @@ static const StringInit *interleaveStringList(const ListInit *List, for (unsigned I = 1, E = List->size(); I < E; ++I) { Result.append(Delim->getValue()); - const StringInit *Element = dyn_cast(List->getElement(I)); + const auto *Element = dyn_cast(List->getElement(I)); if (!Element) return nullptr; Result.append(Element->getValue()); @@ -1120,7 +1119,7 @@ static const StringInit *interleaveIntList(const ListInit *List, RecordKeeper &RK = List->getRecordKeeper(); if (List->size() == 0) return StringInit::get(RK, ""); - const IntInit *Element = dyn_cast_or_null( + const auto *Element = dyn_cast_or_null( List->getElement(0)->convertInitializerTo(IntRecTy::get(RK))); if (!Element) return nullptr; @@ -1128,7 +1127,7 @@ static const StringInit *interleaveIntList(const ListInit *List, for (unsigned I = 1, E = List->size(); I < E; ++I) { Result.append(Delim->getValue()); - const IntInit *Element = dyn_cast_or_null( + const auto *Element = dyn_cast_or_null( List->getElement(I)->convertInitializerTo(IntRecTy::get(RK))); if (!Element) return nullptr; @@ -1139,8 +1138,8 @@ static const StringInit *interleaveIntList(const ListInit *List, const Init *BinOpInit::getStrConcat(const Init *I0, const Init *I1) { // Shortcut for the common case of concatenating two strings. - if (const StringInit *I0s = dyn_cast(I0)) - if (const StringInit *I1s = dyn_cast(I1)) + if (const auto *I0s = dyn_cast(I0)) + if (const auto *I1s = dyn_cast(I1)) return ConcatStringInits(I0s, I1s); return BinOpInit::get(BinOpInit::STRCONCAT, I0, I1, StringRecTy::get(I0->getRecordKeeper())); @@ -1158,8 +1157,8 @@ const Init *BinOpInit::getListConcat(const TypedInit *LHS, const Init *RHS) { assert(isa(LHS->getType()) && "First arg must be a list"); // Shortcut for the common case of concatenating two lists. - if (const ListInit *LHSList = dyn_cast(LHS)) - if (const ListInit *RHSList = dyn_cast(RHS)) + if (const auto *LHSList = dyn_cast(LHS)) + if (const auto *RHSList = dyn_cast(RHS)) return ConcatListInits(LHSList, RHSList); return BinOpInit::get(BinOpInit::LISTCONCAT, LHS, RHS, LHS->getType()); } @@ -1167,9 +1166,9 @@ const Init *BinOpInit::getListConcat(const TypedInit *LHS, const Init *RHS) { std::optional BinOpInit::CompareInit(unsigned Opc, const Init *LHS, const Init *RHS) const { // First see if we have two bit, bits, or int. 
- const IntInit *LHSi = dyn_cast_or_null( + const auto *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(getRecordKeeper()))); - const IntInit *RHSi = dyn_cast_or_null( + const auto *RHSi = dyn_cast_or_null( RHS->convertInitializerTo(IntRecTy::get(getRecordKeeper()))); if (LHSi && RHSi) { @@ -1200,8 +1199,8 @@ std::optional BinOpInit::CompareInit(unsigned Opc, const Init *LHS, } // Next try strings. - const StringInit *LHSs = dyn_cast(LHS); - const StringInit *RHSs = dyn_cast(RHS); + const auto *LHSs = dyn_cast(LHS); + const auto *RHSs = dyn_cast(RHS); if (LHSs && RHSs) { bool Result; @@ -1232,8 +1231,8 @@ std::optional BinOpInit::CompareInit(unsigned Opc, const Init *LHS, // Finally, !eq and !ne can be used with records. if (Opc == EQ || Opc == NE) { - const DefInit *LHSd = dyn_cast(LHS); - const DefInit *RHSd = dyn_cast(RHS); + const auto *LHSd = dyn_cast(LHS); + const auto *RHSd = dyn_cast(RHS); if (LHSd && RHSd) return (Opc == EQ) ? LHSd == RHSd : LHSd != RHSd; } @@ -1244,7 +1243,7 @@ std::optional BinOpInit::CompareInit(unsigned Opc, const Init *LHS, static std::optional getDagArgNoByKey(const DagInit *Dag, const Init *Key, std::string &Error) { // Accessor by index - if (const IntInit *Idx = dyn_cast(Key)) { + if (const auto *Idx = dyn_cast(Key)) { int64_t Pos = Idx->getValue(); if (Pos < 0) { // The index is negative. @@ -1264,7 +1263,7 @@ getDagArgNoByKey(const DagInit *Dag, const Init *Key, std::string &Error) { } assert(isa(Key)); // Accessor by name - const StringInit *Name = dyn_cast(Key); + const auto *Name = dyn_cast(Key); auto ArgNo = Dag->getArgNo(Name->getValue()); if (!ArgNo) { // The key is not found. @@ -1277,11 +1276,11 @@ getDagArgNoByKey(const DagInit *Dag, const Init *Key, std::string &Error) { const Init *BinOpInit::Fold(const Record *CurRec) const { switch (getOpcode()) { case CONCAT: { - const DagInit *LHSs = dyn_cast(LHS); - const DagInit *RHSs = dyn_cast(RHS); + const auto *LHSs = dyn_cast(LHS); + const auto *RHSs = dyn_cast(RHS); if (LHSs && RHSs) { - const DefInit *LOp = dyn_cast(LHSs->getOperator()); - const DefInit *ROp = dyn_cast(RHSs->getOperator()); + const auto *LOp = dyn_cast(LHSs->getOperator()); + const auto *ROp = dyn_cast(RHSs->getOperator()); if ((!LOp && !isa(LHSs->getOperator())) || (!ROp && !isa(RHSs->getOperator()))) break; @@ -1309,8 +1308,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { break; } case LISTCONCAT: { - const ListInit *LHSs = dyn_cast(LHS); - const ListInit *RHSs = dyn_cast(RHS); + const auto *LHSs = dyn_cast(LHS); + const auto *RHSs = dyn_cast(RHS); if (LHSs && RHSs) { SmallVector Args; llvm::append_range(Args, *LHSs); @@ -1320,8 +1319,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { break; } case LISTSPLAT: { - const TypedInit *Value = dyn_cast(LHS); - const IntInit *Size = dyn_cast(RHS); + const auto *Value = dyn_cast(LHS); + const auto *Size = dyn_cast(RHS); if (Value && Size) { SmallVector Args(Size->getValue(), Value); return ListInit::get(Args, Value->getType()); @@ -1329,8 +1328,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { break; } case LISTREMOVE: { - const ListInit *LHSs = dyn_cast(LHS); - const ListInit *RHSs = dyn_cast(RHS); + const auto *LHSs = dyn_cast(LHS); + const auto *RHSs = dyn_cast(RHS); if (LHSs && RHSs) { SmallVector Args; for (const Init *EltLHS : *LHSs) { @@ -1351,8 +1350,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { break; } case LISTELEM: { - auto *TheList = dyn_cast(LHS); - auto *Idx = dyn_cast(RHS); + const auto *TheList = 
dyn_cast(LHS); + const auto *Idx = dyn_cast(RHS); if (!TheList || !Idx) break; auto i = Idx->getValue(); @@ -1361,8 +1360,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { return TheList->getElement(i); } case LISTSLICE: { - auto *TheList = dyn_cast(LHS); - auto *SliceIdxs = dyn_cast(RHS); + const auto *TheList = dyn_cast(LHS); + const auto *SliceIdxs = dyn_cast(RHS); if (!TheList || !SliceIdxs) break; SmallVector Args; @@ -1379,8 +1378,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { return ListInit::get(Args, TheList->getElementType()); } case RANGEC: { - auto *LHSi = dyn_cast(LHS); - auto *RHSi = dyn_cast(RHS); + const auto *LHSi = dyn_cast(LHS); + const auto *RHSi = dyn_cast(RHS); if (!LHSi || !RHSi) break; @@ -1411,15 +1410,15 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { return ListInit::get(Args, LHSi->getType()); } case STRCONCAT: { - const StringInit *LHSs = dyn_cast(LHS); - const StringInit *RHSs = dyn_cast(RHS); + const auto *LHSs = dyn_cast(LHS); + const auto *RHSs = dyn_cast(RHS); if (LHSs && RHSs) return ConcatStringInits(LHSs, RHSs); break; } case INTERLEAVE: { - const ListInit *List = dyn_cast(LHS); - const StringInit *Delim = dyn_cast(RHS); + const auto *List = dyn_cast(LHS); + const auto *Delim = dyn_cast(RHS); if (List && Delim) { const StringInit *Result; if (isa(List->getElementType())) @@ -1442,7 +1441,7 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { break; } case GETDAGARG: { - const DagInit *Dag = dyn_cast(LHS); + const auto *Dag = dyn_cast(LHS); if (Dag && isa(RHS)) { std::string Error; auto ArgNo = getDagArgNoByKey(Dag, RHS, Error); @@ -1452,7 +1451,7 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { assert(*ArgNo < Dag->getNumArgs()); const Init *Arg = Dag->getArg(*ArgNo); - if (auto *TI = dyn_cast(Arg)) + if (const auto *TI = dyn_cast(Arg)) if (!TI->getType()->typeIsConvertibleTo(getType())) return UnsetInit::get(Dag->getRecordKeeper()); return Arg; @@ -1460,8 +1459,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { break; } case GETDAGNAME: { - const DagInit *Dag = dyn_cast(LHS); - const IntInit *Idx = dyn_cast(RHS); + const auto *Dag = dyn_cast(LHS); + const auto *Idx = dyn_cast(RHS); if (Dag && Idx) { int64_t Pos = Idx->getValue(); if (Pos < 0 || Pos >= Dag->getNumArgs()) { @@ -1479,8 +1478,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { break; } case SETDAGOP: { - const DagInit *Dag = dyn_cast(LHS); - const DefInit *Op = dyn_cast(RHS); + const auto *Dag = dyn_cast(LHS); + const auto *Op = dyn_cast(RHS); if (Dag && Op) { SmallVector Args; SmallVector ArgNames; @@ -1502,9 +1501,9 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { case SHL: case SRA: case SRL: { - const IntInit *LHSi = dyn_cast_or_null( + const auto *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(getRecordKeeper()))); - const IntInit *RHSi = dyn_cast_or_null( + const auto *RHSi = dyn_cast_or_null( RHS->convertInitializerTo(IntRecTy::get(getRecordKeeper()))); if (LHSi && RHSi) { int64_t LHSv = LHSi->getValue(), RHSv = RHSi->getValue(); @@ -1643,7 +1642,7 @@ static const Init *ForeachDagApply(const Init *LHS, const DagInit *MHSd, const Init *NewArg; const StringInit *ArgName = MHSd->getArgName(i); - if (const DagInit *Argd = dyn_cast(Arg)) + if (const auto *Argd = dyn_cast(Arg)) NewArg = ForeachDagApply(LHS, Argd, RHS, CurRec); else NewArg = ItemApply(LHS, Arg, RHS, CurRec); @@ -1662,10 +1661,10 @@ static const Init *ForeachDagApply(const Init *LHS, const DagInit *MHSd, static 
const Init *ForeachHelper(const Init *LHS, const Init *MHS, const Init *RHS, const RecTy *Type, const Record *CurRec) { - if (const DagInit *MHSd = dyn_cast(MHS)) + if (const auto *MHSd = dyn_cast(MHS)) return ForeachDagApply(LHS, MHSd, RHS, CurRec); - if (const ListInit *MHSl = dyn_cast(MHS)) { + if (const auto *MHSl = dyn_cast(MHS)) { SmallVector NewList(MHSl->begin(), MHSl->end()); for (const Init *&Item : NewList) { @@ -1684,14 +1683,14 @@ static const Init *ForeachHelper(const Init *LHS, const Init *MHS, static const Init *FilterHelper(const Init *LHS, const Init *MHS, const Init *RHS, const RecTy *Type, const Record *CurRec) { - if (const ListInit *MHSl = dyn_cast(MHS)) { + if (const auto *MHSl = dyn_cast(MHS)) { SmallVector NewList; for (const Init *Item : MHSl->getValues()) { const Init *Include = ItemApply(LHS, Item, RHS, CurRec); if (!Include) return nullptr; - if (const IntInit *IncludeInt = + if (const auto *IncludeInt = dyn_cast_or_null(Include->convertInitializerTo( IntRecTy::get(LHS->getRecordKeeper())))) { if (IncludeInt->getValue()) @@ -1710,17 +1709,17 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { RecordKeeper &RK = getRecordKeeper(); switch (getOpcode()) { case SUBST: { - const DefInit *LHSd = dyn_cast(LHS); - const VarInit *LHSv = dyn_cast(LHS); - const StringInit *LHSs = dyn_cast(LHS); + const auto *LHSd = dyn_cast(LHS); + const auto *LHSv = dyn_cast(LHS); + const auto *LHSs = dyn_cast(LHS); - const DefInit *MHSd = dyn_cast(MHS); - const VarInit *MHSv = dyn_cast(MHS); - const StringInit *MHSs = dyn_cast(MHS); + const auto *MHSd = dyn_cast(MHS); + const auto *MHSv = dyn_cast(MHS); + const auto *MHSs = dyn_cast(MHS); - const DefInit *RHSd = dyn_cast(RHS); - const VarInit *RHSv = dyn_cast(RHS); - const StringInit *RHSs = dyn_cast(RHS); + const auto *RHSd = dyn_cast(RHS); + const auto *RHSv = dyn_cast(RHS); + const auto *RHSs = dyn_cast(RHS); if (LHSd && MHSd && RHSd) { const Record *Val = RHSd->getDef(); @@ -1766,7 +1765,7 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { } case IF: { - if (const IntInit *LHSi = dyn_cast_or_null( + if (const auto *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) { if (LHSi->getValue()) return MHS; @@ -1776,8 +1775,8 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { } case DAG: { - const ListInit *MHSl = dyn_cast(MHS); - const ListInit *RHSl = dyn_cast(RHS); + const auto *MHSl = dyn_cast(MHS); + const auto *RHSl = dyn_cast(RHS); bool MHSok = MHSl || isa(MHS); bool RHSok = RHSl || isa(RHS); @@ -1791,7 +1790,7 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { const Init *Node = MHSl ? MHSl->getElement(i) : UnsetInit::get(RK); const Init *Name = RHSl ? 
RHSl->getElement(i) : UnsetInit::get(RK); if (!isa(Name) && !isa(Name)) - return const_cast(this); + return this; Children.emplace_back(Node, dyn_cast(Name)); } return DagInit::get(LHS, nullptr, Children); @@ -1800,9 +1799,9 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { } case RANGE: { - auto *LHSi = dyn_cast(LHS); - auto *MHSi = dyn_cast(MHS); - auto *RHSi = dyn_cast(RHS); + const auto *LHSi = dyn_cast(LHS); + const auto *MHSi = dyn_cast(MHS); + const auto *RHSi = dyn_cast(RHS); if (!LHSi || !MHSi || !RHSi) break; @@ -1828,9 +1827,9 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { } case SUBSTR: { - const StringInit *LHSs = dyn_cast(LHS); - const IntInit *MHSi = dyn_cast(MHS); - const IntInit *RHSi = dyn_cast(RHS); + const auto *LHSs = dyn_cast(LHS); + const auto *MHSi = dyn_cast(MHS); + const auto *RHSi = dyn_cast(RHS); if (LHSs && MHSi && RHSi) { int64_t StringSize = LHSs->getValue().size(); int64_t Start = MHSi->getValue(); @@ -1849,9 +1848,9 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { } case FIND: { - const StringInit *LHSs = dyn_cast(LHS); - const StringInit *MHSs = dyn_cast(MHS); - const IntInit *RHSi = dyn_cast(RHS); + const auto *LHSs = dyn_cast(LHS); + const auto *MHSs = dyn_cast(MHS); + const auto *RHSi = dyn_cast(RHS); if (LHSs && MHSs && RHSi) { int64_t SourceSize = LHSs->getValue().size(); int64_t Start = RHSi->getValue(); @@ -1869,7 +1868,7 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { } case SETDAGARG: { - const DagInit *Dag = dyn_cast(LHS); + const auto *Dag = dyn_cast(LHS); if (Dag && isa(MHS)) { std::string Error; auto ArgNo = getDagArgNoByKey(Dag, MHS, Error); @@ -1887,7 +1886,7 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { } case SETDAGNAME: { - const DagInit *Dag = dyn_cast(LHS); + const auto *Dag = dyn_cast(LHS); if (Dag && isa(MHS)) { std::string Error; auto ArgNo = getDagArgNoByKey(Dag, MHS, Error); @@ -1905,14 +1904,14 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { } } - return const_cast(this); + return this; } const Init *TernOpInit::resolveReferences(Resolver &R) const { const Init *lhs = LHS->resolveReferences(R); if (getOpcode() == IF && lhs != LHS) { - if (const IntInit *Value = dyn_cast_or_null( + if (const auto *Value = dyn_cast_or_null( lhs->convertInitializerTo(IntRecTy::get(getRecordKeeper())))) { // Short-circuit if (Value->getValue()) @@ -1996,7 +1995,7 @@ void FoldOpInit::Profile(FoldingSetNodeID &ID) const { } const Init *FoldOpInit::Fold(const Record *CurRec) const { - if (const ListInit *LI = dyn_cast(List)) { + if (const auto *LI = dyn_cast(List)) { const Init *Accum = Start; for (const Init *Elt : *LI) { MapResolver R(CurRec); @@ -2025,7 +2024,7 @@ const Init *FoldOpInit::resolveReferences(Resolver &R) const { } const Init *FoldOpInit::getBit(unsigned Bit) const { - return VarBitInit::get(const_cast(this), Bit); + return VarBitInit::get(this, Bit); } std::string FoldOpInit::getAsString() const { @@ -2061,7 +2060,7 @@ void IsAOpInit::Profile(FoldingSetNodeID &ID) const { } const Init *IsAOpInit::Fold() const { - if (const TypedInit *TI = dyn_cast(Expr)) { + if (const auto *TI = dyn_cast(Expr)) { // Is the expression type known to be (a subclass of) the desired type? 
if (TI->getType()->typeIsConvertibleTo(CheckType)) return IntInit::get(getRecordKeeper(), 1); @@ -2088,7 +2087,7 @@ const Init *IsAOpInit::resolveReferences(Resolver &R) const { } const Init *IsAOpInit::getBit(unsigned Bit) const { - return VarBitInit::get(const_cast(this), Bit); + return VarBitInit::get(this, Bit); } std::string IsAOpInit::getAsString() const { @@ -2124,7 +2123,7 @@ void ExistsOpInit::Profile(FoldingSetNodeID &ID) const { } const Init *ExistsOpInit::Fold(const Record *CurRec, bool IsFinal) const { - if (const StringInit *Name = dyn_cast(Expr)) { + if (const auto *Name = dyn_cast(Expr)) { // Look up all defined records to see if we can find one. const Record *D = CheckType->getRecordKeeper().getDef(Name->getValue()); if (D) { @@ -2140,7 +2139,7 @@ const Init *ExistsOpInit::Fold(const Record *CurRec, bool IsFinal) const { if (Name == CurRec->getNameInit() || (Anonymous && Name == Anonymous->getNameInit())) { if (!IsFinal) - return const_cast(this); + return this; // No doubt that there exists a record, so we should check if types are // compatible. @@ -2163,7 +2162,7 @@ const Init *ExistsOpInit::resolveReferences(Resolver &R) const { } const Init *ExistsOpInit::getBit(unsigned Bit) const { - return VarBitInit::get(const_cast(this), Bit); + return VarBitInit::get(this, Bit); } std::string ExistsOpInit::getAsString() const { @@ -2173,7 +2172,7 @@ std::string ExistsOpInit::getAsString() const { } const RecTy *TypedInit::getFieldType(const StringInit *FieldName) const { - if (const RecordRecTy *RecordType = dyn_cast(getType())) { + if (const auto *RecordType = dyn_cast(getType())) { for (const Record *Rec : RecordType->getClasses()) { if (const RecordVal *Field = Rec->getValue(FieldName)) return Field->getType(); @@ -2184,18 +2183,18 @@ const RecTy *TypedInit::getFieldType(const StringInit *FieldName) const { const Init *TypedInit::convertInitializerTo(const RecTy *Ty) const { if (getType() == Ty || getType()->typeIsA(Ty)) - return const_cast(this); + return this; if (isa(getType()) && isa(Ty) && cast(Ty)->getNumBits() == 1) - return BitsInit::get(getRecordKeeper(), {const_cast(this)}); + return BitsInit::get(getRecordKeeper(), {this}); return nullptr; } const Init * TypedInit::convertInitializerBitRange(ArrayRef Bits) const { - const BitsRecTy *T = dyn_cast(getType()); + const auto *T = dyn_cast(getType()); if (!T) return nullptr; // Cannot subscript a non-bits variable. 
unsigned NumBits = T->getNumBits(); @@ -2205,7 +2204,7 @@ TypedInit::convertInitializerBitRange(ArrayRef Bits) const { if (Bit >= NumBits) return nullptr; - NewBits.push_back(VarBitInit::get(const_cast(this), Bit)); + NewBits.push_back(VarBitInit::get(this, Bit)); } return BitsInit::get(getRecordKeeper(), NewBits); } @@ -2213,7 +2212,7 @@ TypedInit::convertInitializerBitRange(ArrayRef Bits) const { const Init *TypedInit::getCastTo(const RecTy *Ty) const { // Handle the common case quickly if (getType() == Ty || getType()->typeIsA(Ty)) - return const_cast(this); + return this; if (const Init *Converted = convertInitializerTo(Ty)) { assert(!isa(Converted) || @@ -2224,8 +2223,7 @@ const Init *TypedInit::getCastTo(const RecTy *Ty) const { if (!getType()->typeIsConvertibleTo(Ty)) return nullptr; - return UnOpInit::get(UnOpInit::CAST, const_cast(this), Ty) - ->Fold(nullptr); + return UnOpInit::get(UnOpInit::CAST, this, Ty)->Fold(nullptr); } const VarInit *VarInit::get(StringRef VN, const RecTy *T) { @@ -2242,14 +2240,14 @@ const VarInit *VarInit::get(const Init *VN, const RecTy *T) { } StringRef VarInit::getName() const { - const StringInit *NameString = cast(getNameInit()); + const auto *NameString = cast(getNameInit()); return NameString->getValue(); } const Init *VarInit::getBit(unsigned Bit) const { if (getType() == BitRecTy::get(getRecordKeeper())) - return const_cast(this); - return VarBitInit::get(const_cast(this), Bit); + return this; + return VarBitInit::get(this, Bit); } const Init *VarInit::resolveReferences(Resolver &R) const { @@ -2284,7 +2282,7 @@ DefInit::DefInit(const Record *D) const Init *DefInit::convertInitializerTo(const RecTy *Ty) const { if (auto *RRT = dyn_cast(Ty)) if (getType()->typeIsConvertibleTo(RRT)) - return const_cast(this); + return this; return nullptr; } @@ -2396,7 +2394,7 @@ const Init *VarDefInit::resolveReferences(Resolver &R) const { NewArgs.reserve(args_size()); for (const ArgumentInit *Arg : args()) { - auto *NewArg = cast(Arg->resolveReferences(UR)); + const auto *NewArg = cast(Arg->resolveReferences(UR)); NewArgs.push_back(NewArg); Changed |= NewArg != Arg; } @@ -2444,8 +2442,8 @@ const FieldInit *FieldInit::get(const Init *R, const StringInit *FN) { const Init *FieldInit::getBit(unsigned Bit) const { if (getType() == BitRecTy::get(getRecordKeeper())) - return const_cast(this); - return VarBitInit::get(const_cast(this), Bit); + return this; + return VarBitInit::get(this, Bit); } const Init *FieldInit::resolveReferences(Resolver &R) const { @@ -2456,7 +2454,7 @@ const Init *FieldInit::resolveReferences(Resolver &R) const { } const Init *FieldInit::Fold(const Record *CurRec) const { - if (const DefInit *DI = dyn_cast(Rec)) { + if (const auto *DI = dyn_cast(Rec)) { const Record *Def = DI->getDef(); if (Def == CurRec) PrintFatalError(CurRec->getLoc(), @@ -2467,11 +2465,11 @@ const Init *FieldInit::Fold(const Record *CurRec) const { if (FieldVal->isConcrete()) return FieldVal; } - return const_cast(this); + return this; } bool FieldInit::isConcrete() const { - if (const DefInit *DI = dyn_cast(Rec)) { + if (const auto *DI = dyn_cast(Rec)) { const Init *FieldVal = DI->getDef()->getValue(FieldName)->getValue(); return FieldVal->isConcrete(); } @@ -2557,12 +2555,12 @@ const Init *CondOpInit::Fold(const Record *CurRec) const { const Init *Cond = getCond(i); const Init *Val = getVal(i); - if (const IntInit *CondI = dyn_cast_or_null( + if (const auto *CondI = dyn_cast_or_null( Cond->convertInitializerTo(IntRecTy::get(RK)))) { if (CondI->getValue()) return 
Val->convertInitializerTo(getValType()); } else { - return const_cast(this); + return this; } } @@ -2609,7 +2607,7 @@ std::string CondOpInit::getAsString() const { } const Init *CondOpInit::getBit(unsigned Bit) const { - return VarBitInit::get(const_cast(this), Bit); + return VarBitInit::get(this, Bit); } static void ProfileDagInit(FoldingSetNodeID &ID, const Init *V, @@ -2675,7 +2673,7 @@ void DagInit::Profile(FoldingSetNodeID &ID) const { } const Record *DagInit::getOperatorAsDef(ArrayRef Loc) const { - if (const DefInit *DefI = dyn_cast(Val)) + if (const auto *DefI = dyn_cast(Val)) return DefI->getDef(); PrintFatalError(Loc, "Expected record as operator"); return nullptr; @@ -2756,7 +2754,7 @@ StringRef RecordVal::getName() const { std::string RecordVal::getPrintType() const { if (getType() == StringRecTy::get(getRecordKeeper())) { - if (auto *StrInit = dyn_cast(Value)) { + if (const auto *StrInit = dyn_cast(Value)) { if (StrInit->hasCodeFormat()) return "code"; else @@ -2775,7 +2773,7 @@ bool RecordVal::setValue(const Init *V) { if (Value) { assert(!isa(Value) || cast(Value)->getType()->typeIsA(getType())); - if (const BitsRecTy *BTy = dyn_cast(getType())) { + if (const auto *BTy = dyn_cast(getType())) { if (!isa(Value)) { SmallVector Bits; Bits.reserve(BTy->getNumBits()); @@ -2800,7 +2798,7 @@ bool RecordVal::setValue(const Init *V, SMLoc NewLoc) { if (Value) { assert(!isa(Value) || cast(Value)->getType()->typeIsA(getType())); - if (const BitsRecTy *BTy = dyn_cast(getType())) { + if (const auto *BTy = dyn_cast(getType())) { if (!isa(Value)) { SmallVector Bits; Bits.reserve(BTy->getNumBits()); @@ -2841,7 +2839,7 @@ void Record::updateClassLoc(SMLoc Loc) { void Record::checkName() { // Ensure the record name has string type. - const TypedInit *TypedName = cast(Name); + const auto *TypedName = cast(Name); if (!isa(TypedName->getType())) PrintFatalError(getLoc(), Twine("Record name '") + Name->getAsString() + "' is not a string!"); @@ -2926,7 +2924,7 @@ void Record::resolveReferences(Resolver &R, const RecordVal *SkipVal) { const Init *VR = V->resolveReferences(R); if (Value.setValue(VR)) { std::string Type; - if (const TypedInit *VRT = dyn_cast(VR)) + if (const auto *VRT = dyn_cast(VR)) Type = (Twine("of type '") + VRT->getType()->getAsString() + "' ").str(); PrintFatalError( @@ -3033,7 +3031,7 @@ Record::getValueAsOptionalString(StringRef FieldName) const { if (isa(R->getValue())) return std::nullopt; - if (const StringInit *SI = dyn_cast(R->getValue())) + if (const auto *SI = dyn_cast(R->getValue())) return SI->getValue(); PrintFatalError(getLoc(), @@ -3047,7 +3045,7 @@ const BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const { PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); - if (const BitsInit *BI = dyn_cast(R->getValue())) + if (const auto *BI = dyn_cast(R->getValue())) return BI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' exists but does not have a bits value"); @@ -3059,7 +3057,7 @@ const ListInit *Record::getValueAsListInit(StringRef FieldName) const { PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); - if (const ListInit *LI = dyn_cast(R->getValue())) + if (const auto *LI = dyn_cast(R->getValue())) return LI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' exists but does not have a list value"); @@ -3070,7 +3068,7 @@ Record::getValueAsListOfDefs(StringRef FieldName) const { const 
ListInit *List = getValueAsListInit(FieldName); std::vector Defs; for (const Init *I : List->getValues()) { - if (const DefInit *DI = dyn_cast(I)) + if (const auto *DI = dyn_cast(I)) Defs.push_back(DI->getDef()); else PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + @@ -3086,7 +3084,7 @@ int64_t Record::getValueAsInt(StringRef FieldName) const { PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); - if (const IntInit *II = dyn_cast(R->getValue())) + if (const auto *II = dyn_cast(R->getValue())) return II->getValue(); PrintFatalError(getLoc(), Twine("Record `") + getName() + "', field `" + FieldName + @@ -3099,7 +3097,7 @@ Record::getValueAsListOfInts(StringRef FieldName) const { const ListInit *List = getValueAsListInit(FieldName); std::vector Ints; for (const Init *I : List->getValues()) { - if (const IntInit *II = dyn_cast(I)) + if (const auto *II = dyn_cast(I)) Ints.push_back(II->getValue()); else PrintFatalError(getLoc(), @@ -3115,7 +3113,7 @@ Record::getValueAsListOfStrings(StringRef FieldName) const { const ListInit *List = getValueAsListInit(FieldName); std::vector Strings; for (const Init *I : List->getValues()) { - if (const StringInit *SI = dyn_cast(I)) + if (const auto *SI = dyn_cast(I)) Strings.push_back(SI->getValue()); else PrintFatalError(getLoc(), @@ -3132,7 +3130,7 @@ const Record *Record::getValueAsDef(StringRef FieldName) const { PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); - if (const DefInit *DI = dyn_cast(R->getValue())) + if (const auto *DI = dyn_cast(R->getValue())) return DI->getDef(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a def initializer!"); @@ -3144,7 +3142,7 @@ const Record *Record::getValueAsOptionalDef(StringRef FieldName) const { PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); - if (const DefInit *DI = dyn_cast(R->getValue())) + if (const auto *DI = dyn_cast(R->getValue())) return DI->getDef(); if (isa(R->getValue())) return nullptr; @@ -3158,7 +3156,7 @@ bool Record::getValueAsBit(StringRef FieldName) const { PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); - if (const BitInit *BI = dyn_cast(R->getValue())) + if (const auto *BI = dyn_cast(R->getValue())) return BI->getValue(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a bit initializer!"); @@ -3175,7 +3173,7 @@ bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const { return false; } Unset = false; - if (const BitInit *BI = dyn_cast(R->getValue())) + if (const auto *BI = dyn_cast(R->getValue())) return BI->getValue(); PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a bit initializer!"); @@ -3187,7 +3185,7 @@ const DagInit *Record::getValueAsDag(StringRef FieldName) const { PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); - if (const DagInit *DI = dyn_cast(R->getValue())) + if (const auto *DI = dyn_cast(R->getValue())) return DI; PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName + "' does not have a dag initializer!"); diff --git a/llvm/lib/TableGen/SetTheory.cpp b/llvm/lib/TableGen/SetTheory.cpp index 47718cc8b0e7..ac7ae2cbaed5 100644 --- a/llvm/lib/TableGen/SetTheory.cpp +++ 
b/llvm/lib/TableGen/SetTheory.cpp @@ -296,7 +296,7 @@ void SetTheory::evaluate(const Init *Expr, RecSet &Elts, ArrayRef Loc) { const auto *DagExpr = dyn_cast(Expr); if (!DagExpr) PrintFatalError(Loc, "Invalid set element: " + Expr->getAsString()); - const DefInit *OpInit = dyn_cast(DagExpr->getOperator()); + const auto *OpInit = dyn_cast(DagExpr->getOperator()); if (!OpInit) PrintFatalError(Loc, "Bad set expression: " + Expr->getAsString()); auto I = Operators.find(OpInit->getDef()->getName()); diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 97a7e680e0c3..f315557f38aa 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -68,12 +68,12 @@ LLVM_DUMP_METHOD void SubMultiClassReference::dump() const { } // end namespace llvm static bool checkBitsConcrete(Record &R, const RecordVal &RV) { - const BitsInit *BV = cast(RV.getValue()); + const auto *BV = cast(RV.getValue()); for (unsigned i = 0, e = BV->getNumBits(); i != e; ++i) { const Init *Bit = BV->getBit(i); bool IsReference = false; - if (auto VBI = dyn_cast(Bit)) { - if (auto VI = dyn_cast(VBI->getBitVar())) { + if (const auto *VBI = dyn_cast(Bit)) { + if (const auto *VI = dyn_cast(VBI->getBitVar())) { if (R.getValue(VI->getName())) IsReference = true; } @@ -117,7 +117,7 @@ static const Init *QualifyName(Record &CurRec, const Init *Name) { StringInit::get(RK, CurRec.isMultiClass() ? "::" : ":")); NewName = BinOpInit::getStrConcat(NewName, Name); - if (const BinOpInit *BinOp = dyn_cast(NewName)) + if (const auto *BinOp = dyn_cast(NewName)) NewName = BinOp->Fold(&CurRec); return NewName; } @@ -186,7 +186,7 @@ const Init *TGVarScope::getVar(RecordKeeper &Records, case SK_ForeachLoop: { // The variable is a loop iterator? if (CurLoop->IterVar) { - const VarInit *IterVar = dyn_cast(CurLoop->IterVar); + const auto *IterVar = dyn_cast(CurLoop->IterVar); if (IterVar && IterVar->getNameInit() == Name) return IterVar; } @@ -243,7 +243,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, const Init *ValName, // Do not allow assignments like 'X = X'. This will just cause infinite loops // in the resolution machinery. if (BitList.empty()) - if (const VarInit *VI = dyn_cast(V)) + if (const auto *VI = dyn_cast(V)) if (VI->getNameInit() == ValName && !AllowSelfAssignment) return Error(Loc, "Recursion / self-assignment forbidden"); @@ -252,7 +252,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, const Init *ValName, // initializer. // if (!BitList.empty()) { - const BitsInit *CurVal = dyn_cast(RV->getValue()); + const auto *CurVal = dyn_cast(RV->getValue()); if (!CurVal) return Error(Loc, "Value '" + ValName->getAsUnquotedString() + "' is not a bits type"); @@ -282,10 +282,10 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, const Init *ValName, if (OverrideDefLoc ? RV->setValue(V, Loc) : RV->setValue(V)) { std::string InitType; - if (const BitsInit *BI = dyn_cast(V)) + if (const auto *BI = dyn_cast(V)) InitType = (Twine("' of type bit initializer with length ") + Twine(BI->getNumBits())).str(); - else if (const TypedInit *TI = dyn_cast(V)) + else if (const auto *TI = dyn_cast(V)) InitType = (Twine("' of type '") + TI->getType()->getAsString()).str(); return Error(Loc, "Field '" + ValName->getAsUnquotedString() + "' of type '" + RV->getType()->getAsString() + @@ -437,7 +437,7 @@ bool TGParser::resolve(const ForeachLoop &Loop, SubstStack &Substs, // the condition here. We want to defer final resolution of the arms // until the resulting records are finalized. // e.g. 
!if(!exists("__does_not_exist__"), [1], []) - if (auto *TI = dyn_cast(List); + if (const auto *TI = dyn_cast(List); TI && TI->getOpcode() == TernOpInit::IF && Final) { const Init *OldLHS = TI->getLHS(); R.setFinal(true); @@ -454,7 +454,7 @@ bool TGParser::resolve(const ForeachLoop &Loop, SubstStack &Substs, ->Fold(nullptr); } - auto LI = dyn_cast(List); + const auto *LI = dyn_cast(List); if (!LI) { if (!Final) { Dest->emplace_back(std::make_unique(Loop.Loc, Loop.IterVar, @@ -838,7 +838,7 @@ const TypedInit *TGParser::ParseSliceElement(Record *CurRec) { auto *CurVal = ParseValue(CurRec); if (!CurVal) return nullptr; - auto *LHS = cast(CurVal); + const auto *LHS = cast(CurVal); const TypedInit *RHS = nullptr; switch (Lex.getCode()) { @@ -916,7 +916,7 @@ const TypedInit *TGParser::ParseSliceElements(Record *CurRec, bool Single) { return nullptr; auto *CurValTy = CurVal->getType(); - if (auto *ListValTy = dyn_cast(CurValTy)) { + if (const auto *ListValTy = dyn_cast(CurValTy)) { if (!isa(ListValTy->getElementType())) { Error(LHSLoc, "expected list, got " + Twine(ListValTy->getAsString())); @@ -977,7 +977,7 @@ bool TGParser::ParseRangePiece(SmallVectorImpl &Ranges, if (!CurVal) CurVal = ParseValue(nullptr); - const IntInit *II = dyn_cast_or_null(CurVal); + const auto *II = dyn_cast_or_null(CurVal); if (!II) return TokError("expected integer or bitrange"); @@ -997,7 +997,7 @@ bool TGParser::ParseRangePiece(SmallVectorImpl &Ranges, Lex.Lex(); // eat const Init *I_End = ParseValue(nullptr); - const IntInit *II_End = dyn_cast_or_null(I_End); + const auto *II_End = dyn_cast_or_null(I_End); if (!II_End) { TokError("expected integer value as end of range"); return true; @@ -1167,7 +1167,7 @@ const Init *TGParser::ParseIDValue(Record *CurRec, const StringInit *Name, if (const Init *I = Records.getGlobal(Name->getValue())) { // Add a reference to the global if it's a record. 
if (TrackReferenceLocs) { - if (auto *Def = dyn_cast(I)) + if (const auto *Def = dyn_cast(I)) Def->getDef()->appendReferenceLoc(NameLoc); } return I; @@ -1301,10 +1301,10 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (!LHS) return nullptr; if (Code == UnOpInit::EMPTY || Code == UnOpInit::SIZE) { - const ListInit *LHSl = dyn_cast(LHS); - const StringInit *LHSs = dyn_cast(LHS); - const DagInit *LHSd = dyn_cast(LHS); - const TypedInit *LHSt = dyn_cast(LHS); + const auto *LHSl = dyn_cast(LHS); + const auto *LHSs = dyn_cast(LHS); + const auto *LHSd = dyn_cast(LHS); + const auto *LHSt = dyn_cast(LHS); if (!LHSl && !LHSs && !LHSd && !LHSt) { TokError("expected string, list, or dag type argument in unary operator"); return nullptr; @@ -1319,8 +1319,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL || Code == UnOpInit::LISTFLATTEN) { - const ListInit *LHSl = dyn_cast(LHS); - const TypedInit *LHSt = dyn_cast(LHS); + const auto *LHSl = dyn_cast(LHS); + const auto *LHSt = dyn_cast(LHS); if (!LHSl && !LHSt) { TokError("expected list type argument in unary operator"); return nullptr; @@ -1340,7 +1340,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { Code == UnOpInit::HEAD || Code == UnOpInit::LISTFLATTEN; if (LHSl) { const Init *Item = LHSl->getElement(0); - const TypedInit *Itemt = dyn_cast(Item); + const auto *Itemt = dyn_cast(Item); if (!Itemt) { TokError("untyped list element in unary operator"); return nullptr; @@ -1349,14 +1349,14 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { : ListRecTy::get(Itemt->getType()); } else { assert(LHSt && "expected list type argument in unary operator"); - const ListRecTy *LType = dyn_cast(LHSt->getType()); + const auto *LType = dyn_cast(LHSt->getType()); Type = UseElementType ? LType->getElementType() : LType; } // for !listflatten, we expect a list of lists, but also support a list of // non-lists, where !listflatten will be a NOP. if (Code == UnOpInit::LISTFLATTEN) { - const ListRecTy *InnerListTy = dyn_cast(Type); + const auto *InnerListTy = dyn_cast(Type); if (InnerListTy) { // listflatten will convert list> to list. 
Type = ListRecTy::get(InnerListTy->getElementType()); @@ -1417,13 +1417,13 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (!Expr) return nullptr; - const TypedInit *ExprType = dyn_cast(Expr); + const auto *ExprType = dyn_cast(Expr); if (!ExprType) { Error(ExprLoc, "expected string type argument in !exists operator"); return nullptr; } - const RecordRecTy *RecType = dyn_cast(ExprType->getType()); + const auto *RecType = dyn_cast(ExprType->getType()); if (RecType) { Error(ExprLoc, "expected string type argument in !exists operator, please " @@ -1431,7 +1431,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - const StringRecTy *SType = dyn_cast(ExprType->getType()); + const auto *SType = dyn_cast(ExprType->getType()); if (!SType) { Error(ExprLoc, "expected string type argument in !exists operator"); return nullptr; @@ -1595,7 +1595,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { InitList.push_back(ParseValue(CurRec, ArgType)); if (!InitList.back()) return nullptr; - const TypedInit *InitListBack = dyn_cast(InitList.back()); + const auto *InitListBack = dyn_cast(InitList.back()); if (!InitListBack) { Error(OpLoc, Twine("expected value to be a typed value, got '" + InitList.back()->getAsString() + "'")); @@ -1806,7 +1806,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (!Args.back()) return nullptr; - const TypedInit *ArgBack = dyn_cast(Args.back()); + const auto *ArgBack = dyn_cast(Args.back()); if (!ArgBack) { Error(OpLoc, Twine("expected value to be a typed value, got '" + Args.back()->getAsString() + "'")); @@ -1847,8 +1847,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { const Init *LHS, *MHS, *RHS; auto ArgCount = Args.size(); assert(ArgCount >= 1); - auto *Arg0 = cast(Args[0]); - auto *Arg0Ty = Arg0->getType(); + const auto *Arg0 = cast(Args[0]); + const auto *Arg0Ty = Arg0->getType(); if (ArgCount == 1) { if (isa(Arg0Ty)) { // (0, !size(arg), 1) @@ -1865,13 +1865,13 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } } else { assert(isa(Arg0Ty)); - auto *Arg1 = cast(Args[1]); + const auto *Arg1 = cast(Args[1]); assert(isa(Arg1->getType())); LHS = Arg0; MHS = Arg1; if (ArgCount == 3) { // (start, end, step) - auto *Arg2 = cast(Args[2]); + const auto *Arg2 = cast(Args[2]); assert(isa(Arg2->getType())); RHS = Arg2; } else @@ -1953,7 +1953,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { switch (LexCode) { default: llvm_unreachable("Unhandled code!"); case tgtok::XDag: { - const TypedInit *MHSt = dyn_cast(MHS); + const auto *MHSt = dyn_cast(MHS); if (!MHSt && !isa(MHS)) { Error(MHSLoc, "could not determine type of the child list in !dag"); return nullptr; @@ -1964,7 +1964,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { return nullptr; } - const TypedInit *RHSt = dyn_cast(RHS); + const auto *RHSt = dyn_cast(RHS); if (!RHSt && !isa(RHS)) { Error(RHSLoc, "could not determine type of the name list in !dag"); return nullptr; @@ -1986,16 +1986,16 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { const RecTy *MHSTy = nullptr; const RecTy *RHSTy = nullptr; - if (const TypedInit *MHSt = dyn_cast(MHS)) + if (const auto *MHSt = dyn_cast(MHS)) MHSTy = MHSt->getType(); - if (const BitsInit *MHSbits = dyn_cast(MHS)) + if (const auto *MHSbits = dyn_cast(MHS)) MHSTy = 
BitsRecTy::get(Records, MHSbits->getNumBits()); if (isa(MHS)) MHSTy = BitRecTy::get(Records); - if (const TypedInit *RHSt = dyn_cast(RHS)) + if (const auto *RHSt = dyn_cast(RHS)) RHSTy = RHSt->getType(); - if (const BitsInit *RHSbits = dyn_cast(RHS)) + if (const auto *RHSbits = dyn_cast(RHS)) RHSTy = BitsRecTy::get(Records, RHSbits->getNumBits()); if (isa(RHS)) RHSTy = BitRecTy::get(Records); @@ -2020,7 +2020,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { break; } case tgtok::XSubst: { - const TypedInit *RHSt = dyn_cast(RHS); + const auto *RHSt = dyn_cast(RHS); if (!RHSt) { TokError("could not get type for !subst"); return nullptr; @@ -2029,7 +2029,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { break; } case tgtok::XSetDagArg: { - const TypedInit *MHSt = dyn_cast(MHS); + const auto *MHSt = dyn_cast(MHS); if (!MHSt || !isa(MHSt->getType())) { Error(MHSLoc, Twine("expected integer index or string name, got ") + (MHSt ? ("type '" + MHSt->getType()->getAsString()) @@ -2040,7 +2040,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { break; } case tgtok::XSetDagName: { - const TypedInit *MHSt = dyn_cast(MHS); + const auto *MHSt = dyn_cast(MHS); if (!MHSt || !isa(MHSt->getType())) { Error(MHSLoc, Twine("expected integer index or string name, got ") + (MHSt ? ("type '" + MHSt->getType()->getAsString()) @@ -2048,7 +2048,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { "'"); return nullptr; } - const TypedInit *RHSt = dyn_cast(RHS); + const auto *RHSt = dyn_cast(RHS); // The name could be a string or unset. if (RHSt && !isa(RHSt->getType())) { Error(RHSLoc, Twine("expected string or unset name, got type '") + @@ -2082,7 +2082,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (!StartUntyped) return nullptr; - const TypedInit *Start = dyn_cast(StartUntyped); + const auto *Start = dyn_cast(StartUntyped); if (!Start) { TokError(Twine("could not get type of !foldl start: '") + StartUntyped->getAsString() + "'"); @@ -2098,14 +2098,14 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (!ListUntyped) return nullptr; - const TypedInit *List = dyn_cast(ListUntyped); + const auto *List = dyn_cast(ListUntyped); if (!List) { TokError(Twine("could not get type of !foldl list: '") + ListUntyped->getAsString() + "'"); return nullptr; } - const ListRecTy *ListType = dyn_cast(List->getType()); + const auto *ListType = dyn_cast(List->getType()); if (!ListType) { TokError(Twine("!foldl list must be a list, but is of type '") + List->getType()->getAsString()); @@ -2174,7 +2174,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { if (!ExprUntyped) return nullptr; - const TypedInit *Expr = dyn_cast(ExprUntyped); + const auto *Expr = dyn_cast(ExprUntyped); if (!Expr) { TokError("could not get type of !foldl expression"); return nullptr; @@ -2280,7 +2280,7 @@ const Init *TGParser::ParseOperationSubstr(Record *CurRec, Type->getAsString() + "'"); } - const TypedInit *LHSt = dyn_cast(LHS); + const auto *LHSt = dyn_cast(LHS); if (!LHSt && !isa(LHS)) { TokError("could not determine type of the string in !substr"); return nullptr; @@ -2291,7 +2291,7 @@ const Init *TGParser::ParseOperationSubstr(Record *CurRec, return nullptr; } - const TypedInit *MHSt = dyn_cast(MHS); + const auto *MHSt = dyn_cast(MHS); if (!MHSt && !isa(MHS)) { TokError("could not determine type of the start position in !substr"); 
return nullptr; @@ -2303,7 +2303,7 @@ const Init *TGParser::ParseOperationSubstr(Record *CurRec, } if (RHS) { - const TypedInit *RHSt = dyn_cast(RHS); + const auto *RHSt = dyn_cast(RHS); if (!RHSt && !isa(RHS)) { TokError("could not determine type of the length in !substr"); return nullptr; @@ -2369,7 +2369,7 @@ const Init *TGParser::ParseOperationFind(Record *CurRec, Type->getAsString() + "'"); } - const TypedInit *LHSt = dyn_cast(LHS); + const auto *LHSt = dyn_cast(LHS); if (!LHSt && !isa(LHS)) { TokError("could not determine type of the source string in !find"); return nullptr; @@ -2380,7 +2380,7 @@ const Init *TGParser::ParseOperationFind(Record *CurRec, return nullptr; } - const TypedInit *MHSt = dyn_cast(MHS); + const auto *MHSt = dyn_cast(MHS); if (!MHSt && !isa(MHS)) { TokError("could not determine type of the target string in !find"); return nullptr; @@ -2392,7 +2392,7 @@ const Init *TGParser::ParseOperationFind(Record *CurRec, } if (RHS) { - const TypedInit *RHSt = dyn_cast(RHS); + const auto *RHSt = dyn_cast(RHS); if (!RHSt && !isa(RHS)) { TokError("could not determine type of the start position in !find"); return nullptr; @@ -2450,7 +2450,7 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, return nullptr; } - const TypedInit *MHSt = dyn_cast(MHS); + const auto *MHSt = dyn_cast(MHS); if (!MHSt) { TokError("could not get type of !foreach/!filter list or dag"); return nullptr; @@ -2460,10 +2460,10 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, const RecTy *ExprEltType = nullptr; bool IsDAG = false; - if (const ListRecTy *InListTy = dyn_cast(MHSt->getType())) { + if (const auto *InListTy = dyn_cast(MHSt->getType())) { InEltType = InListTy->getElementType(); if (ItemType) { - if (const ListRecTy *OutListTy = dyn_cast(ItemType)) { + if (const auto *OutListTy = dyn_cast(ItemType)) { ExprEltType = (Operation == tgtok::XForEach) ? OutListTy->getElementType() : IntRecTy::get(Records); @@ -2475,7 +2475,7 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, return nullptr; } } - } else if (const DagRecTy *InDagTy = dyn_cast(MHSt->getType())) { + } else if (const auto *InDagTy = dyn_cast(MHSt->getType())) { if (Operation == tgtok::XFilter) { TokError("!filter must have a list argument"); return nullptr; @@ -2520,7 +2520,7 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, const RecTy *OutType = InEltType; if (Operation == tgtok::XForEach && !IsDAG) { - const TypedInit *RHSt = dyn_cast(RHS); + const auto *RHSt = dyn_cast(RHS); if (!RHSt) { TokError("could not get type of !foreach result expression"); return nullptr; @@ -2585,9 +2585,9 @@ const Init *TGParser::ParseOperationCond(Record *CurRec, const RecTy *Type = nullptr; for (const Init *V : Val) { const RecTy *VTy = nullptr; - if (const TypedInit *Vt = dyn_cast(V)) + if (const auto *Vt = dyn_cast(V)) VTy = Vt->getType(); - if (const BitsInit *Vbits = dyn_cast(V)) + if (const auto *Vbits = dyn_cast(V)) VTy = BitsRecTy::get(Records, Vbits->getNumBits()); if (isa(V)) VTy = BitRecTy::get(Records); @@ -2754,14 +2754,14 @@ const Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, // if the API was a little more orthogonal. // bits values are allowed to initialize n bits. - if (const BitsInit *BI = dyn_cast(Vals[i])) { + if (const auto *BI = dyn_cast(Vals[i])) { for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i) NewBits.push_back(BI->getBit((e - i) - 1)); continue; } // bits can also come from variable initializers. 
- if (const VarInit *VI = dyn_cast(Vals[i])) { - if (const BitsRecTy *BitsRec = dyn_cast(VI->getType())) { + if (const auto *VI = dyn_cast(Vals[i])) { + if (const auto *BitsRec = dyn_cast(VI->getType())) { for (unsigned i = 0, e = BitsRec->getNumBits(); i != e; ++i) NewBits.push_back(VI->getBit((e - i) - 1)); continue; @@ -2788,7 +2788,7 @@ const Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, const ListRecTy *GivenListTy = nullptr; if (ItemType) { - const ListRecTy *ListType = dyn_cast(ItemType); + const auto *ListType = dyn_cast(ItemType); if (!ListType) { TokError(Twine("Encountered a list when expecting a ") + ItemType->getAsString()); @@ -2825,7 +2825,7 @@ const Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType, // Check elements const RecTy *EltTy = nullptr; for (const Init *V : Vals) { - const TypedInit *TArg = dyn_cast(V); + const auto *TArg = dyn_cast(V); if (TArg) { if (EltTy) { EltTy = resolveTypes(EltTy, TArg->getType()); @@ -2957,13 +2957,13 @@ const Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, break; } case tgtok::l_square: { - auto *LHS = dyn_cast(Result); + const auto *LHS = dyn_cast(Result); if (!LHS) { Error(LHSLoc, "Invalid value, list expected"); return nullptr; } - auto *LHSTy = dyn_cast(LHS->getType()); + const auto *LHSTy = dyn_cast(LHS->getType()); if (!LHSTy) { Error(LHSLoc, "Type '" + Twine(LHS->getType()->getAsString()) + "' is invalid, list expected"); @@ -3009,13 +3009,13 @@ const Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, // Add a reference to this field if we know the record class. if (TrackReferenceLocs) { - if (auto *DI = dyn_cast(Result)) { + if (const auto *DI = dyn_cast(Result)) { const RecordVal *V = DI->getDef()->getValue(FieldName); const_cast(V)->addReferenceLoc(FieldNameLoc); - } else if (auto *TI = dyn_cast(Result)) { - if (auto *RecTy = dyn_cast(TI->getType())) { + } else if (const auto *TI = dyn_cast(Result)) { + if (const auto *RecTy = dyn_cast(TI->getType())) { for (const Record *R : RecTy->getClasses()) - if (auto *RV = R->getValue(FieldName)) + if (const auto *RV = R->getValue(FieldName)) const_cast(RV)->addReferenceLoc(FieldNameLoc); } } @@ -3028,7 +3028,7 @@ const Init *TGParser::ParseValue(Record *CurRec, const RecTy *ItemType, case tgtok::paste: SMLoc PasteLoc = Lex.getLoc(); - const TypedInit *LHS = dyn_cast(Result); + const auto *LHS = dyn_cast(Result); if (!LHS) { Error(PasteLoc, "LHS of paste is not typed!"); return nullptr; @@ -3382,7 +3382,7 @@ TGParser::ParseForeachDeclaration(const Init *&ForeachListValue) { if (!I) return nullptr; - const TypedInit *TI = dyn_cast(I); + const auto *TI = dyn_cast(I); if (TI && isa(TI->getType())) { ForeachListValue = I; IterType = cast(TI->getType())->getElementType(); @@ -4424,7 +4424,7 @@ bool TGParser::CheckTemplateArgValues( RecordVal *Arg = ArgsRec->getValue(ArgName); const RecTy *ArgType = Arg->getType(); - if (const TypedInit *ArgValue = dyn_cast(Value->getValue())) { + if (const auto *ArgValue = dyn_cast(Value->getValue())) { auto *CastValue = ArgValue->getCastTo(ArgType); if (CastValue) { assert((!isa(CastValue) || -- GitLab From e6ada7162e25ab28f6e588fba23f0c11dd1238b5 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Mon, 21 Oct 2024 11:10:50 -0700 Subject: [PATCH 274/511] [regalloc][basic] Change spill weight for optsize funcs (#112960) Change the spill weight calculations for `optsize` functions to remove the block frequency multiplier. 
For those functions, we do not want to consider the runtime cost of
spilling, only the codesize cost.

I built a large app with both the basic and greedy (default) register
allocators enabled.

| Regalloc Type | Uncompressed Size Delta | Compressed Size Delta |
| - | - | - |
| Basic | -303.8 KiB (-0.23%) | -232.0 KiB (-0.39%) |
| Greedy | 159.1 KiB (0.12%) | 130.1 KiB (0.22%) |

Since I only saw a size win with the basic register allocator, I decided
to only change the behavior for that type.
---
 llvm/include/llvm/CodeGen/CalcSpillWeights.h  |   7 +-
 llvm/include/llvm/CodeGen/LiveIntervals.h     |   9 +-
 llvm/lib/CodeGen/CalcSpillWeights.cpp         |   8 +-
 llvm/lib/CodeGen/LiveIntervals.cpp            |  19 +-
 llvm/lib/CodeGen/RegAllocBasic.cpp            |   6 +-
 .../AArch64/regalloc-spill-weight-basic.ll    | 168 ++++++++++++++++++
 6 files changed, 205 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/regalloc-spill-weight-basic.ll

diff --git a/llvm/include/llvm/CodeGen/CalcSpillWeights.h b/llvm/include/llvm/CodeGen/CalcSpillWeights.h
index 41b7f10cfc38..acb8b762efc6 100644
--- a/llvm/include/llvm/CodeGen/CalcSpillWeights.h
+++ b/llvm/include/llvm/CodeGen/CalcSpillWeights.h
@@ -18,6 +18,7 @@ class LiveIntervals;
 class MachineBlockFrequencyInfo;
 class MachineFunction;
 class MachineLoopInfo;
+class ProfileSummaryInfo;
 class VirtRegMap;
 
   /// Normalize the spill weight of a live interval
@@ -47,6 +48,7 @@ class VirtRegMap;
     LiveIntervals &LIS;
     const VirtRegMap &VRM;
     const MachineLoopInfo &Loops;
+    ProfileSummaryInfo *PSI;
     const MachineBlockFrequencyInfo &MBFI;
 
     /// Returns true if Reg of live interval LI is used in instruction with many
@@ -56,8 +58,9 @@ class VirtRegMap;
   public:
     VirtRegAuxInfo(MachineFunction &MF, LiveIntervals &LIS,
                    const VirtRegMap &VRM, const MachineLoopInfo &Loops,
-                   const MachineBlockFrequencyInfo &MBFI)
-        : MF(MF), LIS(LIS), VRM(VRM), Loops(Loops), MBFI(MBFI) {}
+                   const MachineBlockFrequencyInfo &MBFI,
+                   ProfileSummaryInfo *PSI = nullptr)
+        : MF(MF), LIS(LIS), VRM(VRM), Loops(Loops), PSI(PSI), MBFI(MBFI) {}
 
     virtual ~VirtRegAuxInfo() = default;
 
diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index 4c45a9676d6b..161bb247a0e9 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -47,6 +47,7 @@ class MachineDominatorTree;
 class MachineFunction;
 class MachineInstr;
 class MachineRegisterInfo;
+class ProfileSummaryInfo;
 class raw_ostream;
 class TargetInstrInfo;
 class VirtRegMap;
@@ -113,14 +114,18 @@ public:
   ~LiveIntervals();
 
   /// Calculate the spill weight to assign to a single instruction.
+  /// If \p PSI is provided the calculation is altered for optsize functions.
   static float getSpillWeight(bool isDef, bool isUse,
                               const MachineBlockFrequencyInfo *MBFI,
-                              const MachineInstr &MI);
+                              const MachineInstr &MI,
+                              ProfileSummaryInfo *PSI = nullptr);
 
   /// Calculate the spill weight to assign to a single instruction.
+  /// If \p PSI is provided the calculation is altered for optsize functions.
  static float getSpillWeight(bool isDef, bool isUse,
                              const MachineBlockFrequencyInfo *MBFI,
-                              const MachineBasicBlock *MBB);
+                              const MachineBasicBlock *MBB,
+                              ProfileSummaryInfo *PSI = nullptr);
 
   LiveInterval &getInterval(Register Reg) {
     if (hasInterval(Reg))
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 88ed2291313c..f361c956092e 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -199,8 +199,10 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
       //   localLI = COPY other
       //   ...
       //   other   = COPY localLI
-      TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB);
-      TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB);
+      TotalWeight +=
+          LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB, PSI);
+      TotalWeight +=
+          LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB, PSI);
 
       NumInstr += 2;
     }
@@ -272,7 +274,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
     // Calculate instr weight.
     bool Reads, Writes;
     std::tie(Reads, Writes) = MI->readsWritesVirtualRegister(LI.reg());
-    Weight = LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI);
+    Weight = LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI, PSI);
 
     // Give extra weight to what looks like a loop induction variable update.
     if (Writes && IsExiting && LIS.isLiveOutOfMBB(LI, MBB))
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 7ddaaaa915ef..21a316cf99a2 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/StackMaps.h"
@@ -37,6 +38,7 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/IR/ProfileSummary.h"
 #include "llvm/IR/Statepoint.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -875,14 +877,23 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
 
 float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
                                     const MachineBlockFrequencyInfo *MBFI,
-                                    const MachineInstr &MI) {
-  return getSpillWeight(isDef, isUse, MBFI, MI.getParent());
+                                    const MachineInstr &MI,
+                                    ProfileSummaryInfo *PSI) {
+  return getSpillWeight(isDef, isUse, MBFI, MI.getParent(), PSI);
 }
 
 float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
                                     const MachineBlockFrequencyInfo *MBFI,
-                                    const MachineBasicBlock *MBB) {
-  return (isDef + isUse) * MBFI->getBlockFreqRelativeToEntryBlock(MBB);
+                                    const MachineBasicBlock *MBB,
+                                    ProfileSummaryInfo *PSI) {
+  float Weight = isDef + isUse;
+  const auto *MF = MBB->getParent();
+  // When optimizing for size we only consider the codesize impact of spilling
+  // the register, not the runtime impact.
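+  // (Illustrative numbers: a block with relative frequency 100 would normally
+  // scale the weight by 100; under optsize every block counts the same, so
+  // only the number of spill/reload instructions matters.)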
+ if (PSI && (MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(MF, PSI, MBFI))) + return Weight; + return Weight * MBFI->getBlockFreqRelativeToEntryBlock(MBB); } LiveRange::Segment diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index caf9c32a5a34..046784c386e3 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -14,6 +14,7 @@ #include "AllocationOrder.h" #include "RegAllocBase.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -140,6 +141,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false, false) @@ -182,6 +184,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addRequiredID(MachineDominatorsID); @@ -312,7 +315,8 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { getAnalysis()); VirtRegAuxInfo VRAI( *MF, *LIS, *VRM, getAnalysis().getLI(), - getAnalysis().getMBFI()); + getAnalysis().getMBFI(), + &getAnalysis().getPSI()); VRAI.calculateSpillWeightsAndHints(); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI)); diff --git a/llvm/test/CodeGen/AArch64/regalloc-spill-weight-basic.ll b/llvm/test/CodeGen/AArch64/regalloc-spill-weight-basic.ll new file mode 100644 index 000000000000..5c3bd984087e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/regalloc-spill-weight-basic.ll @@ -0,0 +1,168 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; RUN: llc < %s -mtriple=aarch64 -regalloc=basic | FileCheck %s + +; Test that the register allocator behaves differently with minsize functions. + +declare void @foo(i32, ptr) + +define void @optsize(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %arg5, i1 %arg6) minsize { +; CHECK-LABEL: optsize: +; CHECK: // %bb.0: // %bb +; CHECK-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w30, -48 +; CHECK-NEXT: mov w23, w5 +; CHECK-NEXT: mov x22, x4 +; CHECK-NEXT: mov x21, x3 +; CHECK-NEXT: mov x20, x2 +; CHECK-NEXT: mov w19, w1 +; CHECK-NEXT: .LBB0_1: // %bb8 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: cbz w19, .LBB0_1 +; CHECK-NEXT: // %bb.2: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmp w19, #39 +; CHECK-NEXT: b.eq .LBB0_6 +; CHECK-NEXT: // %bb.3: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmp w19, #34 +; CHECK-NEXT: b.eq .LBB0_6 +; CHECK-NEXT: // %bb.4: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: cmp w19, #10 +; CHECK-NEXT: b.ne .LBB0_1 +; CHECK-NEXT: // %bb.5: // %bb9 +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: str wzr, [x20] +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_6: // %bb10 +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: mov w0, w23 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: str wzr, [x22] +; CHECK-NEXT: bl foo +; CHECK-NEXT: b .LBB0_1 +bb: + br label %bb7 + +bb7: ; preds = %bb13, %bb + %phi = phi i32 [ 0, %bb ], [ %spec.select, %bb13 ] + br label %bb8 + +bb8: ; preds = %bb10, %bb9, %bb8, %bb7 + switch i32 %arg1, label %bb8 [ + i32 10, label %bb9 + i32 1, label %bb16 + i32 0, label %bb13 + i32 39, label %bb10 + i32 34, label %bb10 + ] + +bb9: ; preds = %bb8 + store i32 0, ptr %arg2, align 4 + br label %bb8 + +bb10: ; preds = %bb8, %bb8 + store i32 0, ptr %arg4, align 4 + tail call void @foo(i32 %arg5, ptr %arg3) + br label %bb8 + +bb13: ; preds = %bb8 + %not.arg6 = xor i1 %arg6, true + %spec.select = zext i1 %not.arg6 to i32 + br label %bb7 + +bb16: ; preds = %bb8 + unreachable +} + +define void @optspeed(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %arg5, i1 %arg6) { +; CHECK-LABEL: optspeed: +; CHECK: // %bb.0: // %bb +; CHECK-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w30, -48 +; CHECK-NEXT: mov w22, w5 +; CHECK-NEXT: mov x21, x4 +; CHECK-NEXT: mov x20, x3 +; CHECK-NEXT: mov x23, x2 +; CHECK-NEXT: mov w19, w1 +; CHECK-NEXT: b .LBB1_2 +; CHECK-NEXT: .LBB1_1: // %bb10 +; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 +; CHECK-NEXT: mov w0, w22 +; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: str wzr, [x21] +; CHECK-NEXT: bl foo +; CHECK-NEXT: .LBB1_2: // %bb8 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: cmp w19, #33 +; CHECK-NEXT: b.gt .LBB1_6 +; CHECK-NEXT: // %bb.3: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 +; CHECK-NEXT: cbz w19, .LBB1_2 +; CHECK-NEXT: // %bb.4: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 +; CHECK-NEXT: cmp w19, #10 +; CHECK-NEXT: b.ne .LBB1_2 +; CHECK-NEXT: // %bb.5: // %bb9 +; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 +; CHECK-NEXT: str wzr, [x23] +; CHECK-NEXT: b .LBB1_2 +; CHECK-NEXT: .LBB1_6: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 +; CHECK-NEXT: cmp w19, #34 +; CHECK-NEXT: b.eq .LBB1_1 +; CHECK-NEXT: // %bb.7: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 +; CHECK-NEXT: cmp w19, #39 +; CHECK-NEXT: b.eq .LBB1_1 +; CHECK-NEXT: b .LBB1_2 +bb: + br label %bb7 + +bb7: ; preds = %bb13, %bb + %phi = phi i32 [ 0, %bb ], [ %spec.select, %bb13 ] + br label %bb8 + +bb8: ; preds = %bb10, %bb9, %bb8, %bb7 + switch i32 %arg1, label %bb8 [ + i32 10, label %bb9 + i32 1, label %bb16 + i32 0, label %bb13 + i32 39, label %bb10 + i32 34, label %bb10 + ] + +bb9: ; preds = %bb8 + store i32 0, ptr %arg2, align 4 + br label %bb8 + +bb10: ; preds = %bb8, %bb8 + store i32 0, ptr %arg4, align 4 + tail call void @foo(i32 %arg5, ptr %arg3) + br label %bb8 + +bb13: ; preds = %bb8 + %not.arg6 = xor i1 %arg6, true + %spec.select = zext i1 %not.arg6 to i32 + br label %bb7 + +bb16: ; preds = %bb8 + unreachable +} -- GitLab From 40ea92c859234d536553cf26650e89d6e52071c6 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 21 Oct 2024 11:37:26 -0700 Subject: [PATCH 275/511] [lldb] Update ScriptInterpreterTests for CommandReturnObject API change --- .../ScriptInterpreter/Lua/ScriptInterpreterTests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/unittests/ScriptInterpreter/Lua/ScriptInterpreterTests.cpp b/lldb/unittests/ScriptInterpreter/Lua/ScriptInterpreterTests.cpp index 2693bef3f5fb..214de14f73ff 100644 --- a/lldb/unittests/ScriptInterpreter/Lua/ScriptInterpreterTests.cpp +++ b/lldb/unittests/ScriptInterpreter/Lua/ScriptInterpreterTests.cpp @@ -48,6 +48,6 @@ TEST_F(ScriptInterpreterTest, ExecuteOneLine) { CommandReturnObject result(/*colors*/ false); EXPECT_TRUE(script_interpreter.ExecuteOneLine("foo = 1", &result)); EXPECT_FALSE(script_interpreter.ExecuteOneLine("nil = foo", &result)); - EXPECT_TRUE(result.GetErrorData().starts_with( - "error: lua failed attempting to evaluate 'nil = foo'")); + EXPECT_EQ(result.GetErrorString().find( + "error: lua failed attempting to evaluate 'nil = foo'"), 0 ); } -- GitLab From 622e398d8828431e082a336814d29932e22c8450 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 21 Oct 2024 11:57:18 -0700 Subject: [PATCH 276/511] 
 [AMDGPU] Allow overload of __builtin_amdgcn_mov/update_dpp (#112447)

We need to support 64-bit data types (the intrinsics already do). We
were also silently converting FP arguments to integer; that is fixed as
well.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def |   4 +-
 clang/lib/CodeGen/CGBuiltin.cpp              |  31 +++--
 clang/lib/Sema/SemaAMDGPU.cpp                |  38 ++++++
 .../test/CodeGenOpenCL/builtins-amdgcn-vi.cl | 110 +++++++++++++++++-
 .../SemaOpenCL/builtins-amdgcn-error-gfx9.cl |  50 ++++++++
 5 files changed, 220 insertions(+), 13 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index c02970f55b22..e887213aa945 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -224,8 +224,8 @@ TARGET_BUILTIN(__builtin_amdgcn_frexp_exph, "sh", "nc", "16-bit-insts")
 TARGET_BUILTIN(__builtin_amdgcn_fracth, "hh", "nc", "16-bit-insts")
 TARGET_BUILTIN(__builtin_amdgcn_classh, "bhi", "nc", "16-bit-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_memrealtime, "WUi", "n", "s-memrealtime")
-TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nc", "dpp")
-TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nc", "dpp")
+TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nct", "dpp")
+TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nct", "dpp")
 TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "gfx8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts")
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 28f28c70b5ae..1ad950798c21 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19037,15 +19037,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     ASTContext::GetBuiltinTypeError Error;
     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
     assert(Error == ASTContext::GE_None && "Should not codegen an error");
+    llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
+    unsigned Size = DataTy->getPrimitiveSizeInBits();
+    llvm::Type *IntTy =
+        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
+    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, IntTy);
+    assert(E->getNumArgs() == 5 || E->getNumArgs() == 6);
+    bool InsertOld = E->getNumArgs() == 5;
+    if (InsertOld)
+      Args.push_back(llvm::PoisonValue::get(IntTy));
     for (unsigned I = 0; I != E->getNumArgs(); ++I) {
-      Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, I, E));
+      llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
+      if (I <= !InsertOld && Size < 32) {
+        if (!DataTy->isIntegerTy())
+          V = Builder.CreateBitCast(
+              V, llvm::IntegerType::get(Builder.getContext(), Size));
+        V = Builder.CreateZExtOrBitCast(V, IntTy);
+      }
+      llvm::Type *ExpTy =
+          F->getFunctionType()->getFunctionParamType(I + InsertOld);
+      Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
     }
-    assert(Args.size() == 5 || Args.size() == 6);
-    if (Args.size() == 5)
-      Args.insert(Args.begin(), llvm::PoisonValue::get(Args[0]->getType()));
-    Function *F =
-        CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
-    return Builder.CreateCall(F, Args);
+    Value *V = Builder.CreateCall(F, Args);
+    if (Size < 32 && !DataTy->isIntegerTy())
+      V = Builder.CreateTrunc(
+          V, llvm::IntegerType::get(Builder.getContext(), Size));
+    return Builder.CreateTruncOrBitCast(V, DataTy);
   }
   case AMDGPU::BI__builtin_amdgcn_permlane16:
   case AMDGPU::BI__builtin_amdgcn_permlanex16:
diff --git 
a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index d11bc9eec330..9e05e8f28b2c 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -63,6 +63,44 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, OrderIndex = 0; ScopeIndex = 1; break; + case AMDGPU::BI__builtin_amdgcn_mov_dpp: { + if (SemaRef.checkArgCountRange(TheCall, 5, 5)) + return true; + Expr *ValArg = TheCall->getArg(0); + QualType Ty = ValArg->getType(); + // TODO: Vectors can also be supported. + if (!Ty->isArithmeticType() || Ty->isAnyComplexType()) { + SemaRef.Diag(ValArg->getBeginLoc(), + diag::err_typecheck_cond_expect_int_float) + << Ty << ValArg->getSourceRange(); + return true; + } + return false; + } + case AMDGPU::BI__builtin_amdgcn_update_dpp: { + if (SemaRef.checkArgCountRange(TheCall, 6, 6)) + return true; + Expr *Args[2]; + QualType ArgTys[2]; + for (unsigned I = 0; I != 2; ++I) { + Args[I] = TheCall->getArg(I); + ArgTys[I] = Args[I]->getType(); + // TODO: Vectors can also be supported. + if (!ArgTys[I]->isArithmeticType() || ArgTys[I]->isAnyComplexType()) { + SemaRef.Diag(Args[I]->getBeginLoc(), + diag::err_typecheck_cond_expect_int_float) + << ArgTys[I] << Args[I]->getSourceRange(); + return true; + } + } + if (ArgTys[0] != ArgTys[1]) { + SemaRef.Diag(Args[1]->getBeginLoc(), + diag::err_typecheck_call_different_arg_types) + << ArgTys[0] << ArgTys[1]; + return true; + } + return false; + } default: return false; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl index 5bd8f77a5930..65b54c1d5527 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl @@ -102,20 +102,122 @@ void test_s_dcache_wb() __builtin_amdgcn_s_dcache_wb(); } -// CHECK-LABEL: @test_mov_dpp +// CHECK-LABEL: @test_mov_dpp_int // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %src, i32 0, i32 0, i32 0, i1 false) -void test_mov_dpp(global int* out, int src) +void test_mov_dpp_int(global int* out, int src) { *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false); } -// CHECK-LABEL: @test_update_dpp +// CHECK-LABEL: @test_mov_dpp_long +// CHECK: %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: store i64 %0, +void test_mov_dpp_long(long x, global long *p) { + *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_mov_dpp_float +// CHECK: %0 = bitcast float %x to i32 +// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: store i32 %1, +void test_mov_dpp_float(float x, global float *p) { + *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_mov_dpp_double +// CHECK: %0 = bitcast double %x to i64 +// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: store i64 %1, +void test_mov_dpp_double(double x, global double *p) { + *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_mov_dpp_short +// CHECK: %0 = zext i16 %x to i32 +// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: %2 = trunc i32 %1 to i16 +// CHECK-NEXT: store i16 %2, +void test_mov_dpp_short(short x, global short *p) { + *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0); +} + +// 
CHECK-LABEL: @test_mov_dpp_char +// CHECK: %0 = zext i8 %x to i32 +// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: %2 = trunc i32 %1 to i8 +// CHECK-NEXT: store i8 %2, +void test_mov_dpp_char(char x, global char *p) { + *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_mov_dpp_half +// CHECK: %0 = load i16, +// CHECK: %1 = zext i16 %0 to i32 +// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: %3 = trunc i32 %2 to i16 +// CHECK-NEXT: store i16 %3, +void test_mov_dpp_half(half *x, global half *p) { + *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_update_dpp_int // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false) -void test_update_dpp(global int* out, int arg1, int arg2) +void test_update_dpp_int(global int* out, int arg1, int arg2) { *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false); } +// CHECK-LABEL: @test_update_dpp_long +// CHECK: %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false) +// CHECk-NEXT: store i64 %0, +void test_update_dpp_long(long x, global long *p) { + *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_update_dpp_float +// CHECK: %0 = bitcast float %x to i32 +// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: store i32 %1, +void test_update_dpp_float(float x, global float *p) { + *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_update_dpp_double +// CHECK: %0 = bitcast double %x to i64 +// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: store i64 %1, +void test_update_dpp_double(double x, global double *p) { + *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_update_dpp_short +// CHECK: %0 = zext i16 %x to i32 +// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: %2 = trunc i32 %1 to i16 +// CHECK-NEXT: store i16 %2, +void test_update_dpp_short(short x, global short *p) { + *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_update_dpp_char +// CHECK: %0 = zext i8 %x to i32 +// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: %2 = trunc i32 %1 to i8 +// CHECK-NEXT: store i8 %2, +void test_update_dpp_char(char x, global char *p) { + *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0); +} + +// CHECK-LABEL: @test_update_dpp_half +// CHECK: %0 = load i16, +// CHECK: %1 = zext i16 %0 to i32 +// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false) +// CHECK-NEXT: %3 = trunc i32 %2 to i16 +// CHECK-NEXT: store i16 %3, +void test_update_dpp_half(half *x, global half *p) { + *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0); +} + // CHECK-LABEL: @test_ds_fadd // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}} // CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}} diff --git 
a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx9.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx9.cl index c9fd8ab2cae8..47b56c703e4c 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx9.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx9.cl @@ -3,7 +3,57 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +typedef int int2 __attribute__((ext_vector_type(2))); + +struct S { + int x; +}; + void test_gfx9_fmed3h(global half *out, half a, half b, half c) { *out = __builtin_amdgcn_fmed3h(a, b, c); // expected-error {{'__builtin_amdgcn_fmed3h' needs target feature gfx9-insts}} } + +void test_mov_dpp(global int* out, int src, int i, int2 i2, struct S s, float _Complex fc) +{ + *out = __builtin_amdgcn_mov_dpp(src, i, 0, 0, false); // expected-error{{argument to '__builtin_amdgcn_mov_dpp' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp(src, 0, i, 0, false); // expected-error{{argument to '__builtin_amdgcn_mov_dpp' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp(src, 0, 0, i, false); // expected-error{{argument to '__builtin_amdgcn_mov_dpp' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, i); // expected-error{{argument to '__builtin_amdgcn_mov_dpp' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp(src, 0.1, 0, 0, false); // expected-error{{argument to '__builtin_amdgcn_mov_dpp' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp(src, 0, 0.1, 0, false); // expected-error{{argument to '__builtin_amdgcn_mov_dpp' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0.1, false); // expected-error{{argument to '__builtin_amdgcn_mov_dpp' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, 0.1); // expected-error{{argument to '__builtin_amdgcn_mov_dpp' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0); // expected-error{{too few arguments to function call, expected 5, have 4}} + *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false, 1); // expected-error{{too many arguments to function call, expected at most 5, have 6}} + *out = __builtin_amdgcn_mov_dpp(out, 0, 0, 0, false); // expected-error{{used type '__global int *__private' where integer or floating point type is required}} + *out = __builtin_amdgcn_mov_dpp("aa", 0, 0, 0, false); // expected-error{{used type '__constant char[3]' where integer or floating point type is required}} + *out = __builtin_amdgcn_mov_dpp(i2, 0, 0, 0, false); // expected-error{{used type '__private int2' (vector of 2 'int' values) where integer or floating point type is required}} + *out = __builtin_amdgcn_mov_dpp(s, 0, 0, 0, false); // expected-error{{used type '__private struct S' where integer or floating point type is required}} + *out = __builtin_amdgcn_mov_dpp(fc, 0, 0, 0, false); // expected-error{{used type '__private _Complex float' where integer or floating point type is required}} +} + +void test_update_dpp(global int* out, int arg1, int arg2, int i, int2 i2, long l, struct S s, float _Complex fc) +{ + *out = __builtin_amdgcn_update_dpp(arg1, arg2, i, 0, 0, false); // expected-error{{argument to '__builtin_amdgcn_update_dpp' must be a constant integer}} + *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, i, 0, false); // expected-error{{argument to '__builtin_amdgcn_update_dpp' must be a constant integer}} + *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, i, false); // expected-error{{argument to '__builtin_amdgcn_update_dpp' must be a constant integer}} + *out = 
__builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, i); // expected-error{{argument to '__builtin_amdgcn_update_dpp' must be a constant integer}} + *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0.1, 0, 0, false); // expected-error{{argument to '__builtin_amdgcn_update_dpp' must be a constant integer}} + *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0.1, 0, false); // expected-error{{argument to '__builtin_amdgcn_update_dpp' must be a constant integer}} + *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0.1, false); // expected-error{{argument to '__builtin_amdgcn_update_dpp' must be a constant integer}} + *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, 0.1); // expected-error{{argument to '__builtin_amdgcn_update_dpp' must be a constant integer}} + *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0); // expected-error{{too few arguments to function call, expected 6, have 5}} + *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false, 1); // expected-error{{too many arguments to function call, expected at most 6, have 7}} + *out = __builtin_amdgcn_update_dpp(out, arg2, 0, 0, 0, false); // expected-error{{used type '__global int *__private' where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(arg1, out, 0, 0, 0, false); // expected-error{{used type '__global int *__private' where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp("aa", arg2, 0, 0, 0, false); // expected-error{{used type '__constant char[3]' where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(arg1, "aa", 0, 0, 0, false); // expected-error{{used type '__constant char[3]' where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(i2, arg2, 0, 0, 0, false); // expected-error{{used type '__private int2' (vector of 2 'int' values) where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(arg1, i2, 0, 0, 0, false); // expected-error{{used type '__private int2' (vector of 2 'int' values) where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(s, arg2, 0, 0, 0, false); // expected-error{{used type '__private struct S' where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(arg1, s, 0, 0, 0, false); // expected-error{{used type '__private struct S' where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(fc, arg2, 0, 0, 0, false); // expected-error{{used type '__private _Complex float' where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(arg1, fc, 0, 0, 0, false); // expected-error{{used type '__private _Complex float' where integer or floating point type is required}} + *out = __builtin_amdgcn_update_dpp(i, l, 0, 0, 0, false); // expected-error{{arguments are of different types ('__private int' vs '__private long')}} +} -- GitLab From 4b1b51ac52445f2308174287c721ad7f60a8053b Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Oct 2024 11:45:16 -0700 Subject: [PATCH 277/511] [SLP]Initial non-power-of-2 support (but still whole register) for reductions Enables initial non-power-of-2 support (but still requires number of elements, forming whole registers) for reductions. 
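A minimal sketch of the "whole registers" rounding (illustrative only;
it mirrors the `getFloorFullVectorNumberOfElements` guard touched below,
with `RegVF` standing in for the per-register element count derived from
TTI):

```cpp
#include <bit>

// Largest element count <= Sz that still fills whole vector registers of
// RegVF elements each; fall back to a power of two when Sz cannot cover
// even one full register.
unsigned floorToWholeRegisters(unsigned Sz, unsigned RegVF) {
  if (RegVF == 0 || RegVF > Sz)
    return std::bit_floor(Sz); // e.g. Sz = 3,  RegVF = 4 -> 2
  return (Sz / RegVF) * RegVF; // e.g. Sz = 15, RegVF = 4 -> 12
}
```

So a 15-element reduction on a 4-lane target is retried at width 12
rather than being rounded all the way down to 8.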
Enables extra vectorization for MultiSource/Benchmarks/7zip/7zip-benchmark, CINT2006/464.h264ref and CFP2017rate/526.blender_r (checked for SSE2) Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/112361 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 523 ++++++++++++------ .../SLPVectorizer/AArch64/tsc-s116.ll | 11 +- .../SLPVectorizer/AArch64/vec3-calls.ll | 3 +- .../X86/gather-node-same-as-vect-but-order.ll | 15 +- .../SLPVectorizer/X86/horizontal-list.ll | 32 +- .../SLPVectorizer/X86/horizontal-minmax.ll | 10 +- .../X86/non-power-of-2-order-detection.ll | 9 +- .../SLPVectorizer/X86/vec3-calls.ll | 3 +- .../X86/vect-gather-same-nodes.ll | 14 +- 9 files changed, 381 insertions(+), 239 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a11e3f3815cb..756b25ac9856 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -291,6 +291,8 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, if (NumParts == 0 || NumParts >= Sz) return bit_floor(Sz); unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts)); + if (RegVF > Sz) + return bit_floor(Sz); return (Sz / RegVF) * RegVF; } @@ -1505,6 +1507,12 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const; + /// Checks if the graph and all its subgraphs cannot be better vectorized. + /// It may happen, if all gather nodes are loads and they cannot be + /// "clusterized". In this case even subgraphs cannot be vectorized more + /// effectively than the base graph. + bool isTreeNotExtendable() const; + /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values /// can be load combined in the backend. Load combining may not be allowed in /// the IR optimizer, so we do not want to alter the pattern. For example, @@ -3047,7 +3055,9 @@ private: /// vector loads/masked gathers instead of regular gathers. Later these loads /// are reshufled to build final gathered nodes. void tryToVectorizeGatheredLoads( - ArrayRef>> GatheredLoads); + const SmallMapVector, + SmallVector>>, + 8> &GatheredLoads); /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. @@ -3059,7 +3069,7 @@ private: /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. It returns the map from the store /// pointers to the collected stores. - DenseMap> + SmallVector> collectUserStores(const BoUpSLP::TreeEntry *TE) const; /// Helper for `findExternalStoreUsersReorderIndices()`. 
It checks if the @@ -4657,7 +4667,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes = true) { - if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2)) + if (getUnderlyingObject(Ptr1, RecursionMaxDepth) != + getUnderlyingObject(Ptr2, RecursionMaxDepth)) return false; auto *GEP1 = dyn_cast(Ptr1); auto *GEP2 = dyn_cast(Ptr2); @@ -5177,30 +5188,40 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, return LoadsState::Gather; } -static bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy, +static bool clusterSortPtrAccesses(ArrayRef VL, + ArrayRef BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl &SortedIndices) { - assert(llvm::all_of( - VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && - "Expected list of pointer operands."); + assert( + all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && + "Expected list of pointer operands."); // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each // Ptr into, sort and return the sorted indices with values next to one // another. - MapVector>> Bases; - Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U)); - - unsigned Cnt = 1; - for (Value *Ptr : VL.drop_front()) { - bool Found = any_of(Bases, [&](auto &Base) { - std::optional Diff = - getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE, - /*StrictCheck=*/true); - if (!Diff) - return false; + SmallMapVector, + SmallVector>>, 8> + Bases; + Bases + .try_emplace(std::make_pair( + BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth))) + .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U); - Base.second.emplace_back(Ptr, *Diff, Cnt++); - return true; - }); + SortedIndices.clear(); + for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) { + auto Key = std::make_pair(BBs[Cnt + 1], + getUnderlyingObject(Ptr, RecursionMaxDepth)); + bool Found = any_of(Bases.try_emplace(Key).first->second, + [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) { + std::optional Diff = getPointersDiff( + ElemTy, std::get<0>(Base.front()), ElemTy, + Ptr, DL, SE, + /*StrictCheck=*/true); + if (!Diff) + return false; + + Base.emplace_back(Ptr, *Diff, Cnt + 1); + return true; + }); if (!Found) { // If we haven't found enough to usefully cluster, return early. @@ -5208,71 +5229,66 @@ static bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy, return false; // Not found already - add a new Base - Bases[Ptr].emplace_back(Ptr, 0, Cnt++); + Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1); } } - // For each of the bases sort the pointers by Offset and check if any of the - // base become consecutively allocated. - bool AnyConsecutive = false; - for (auto &Base : Bases) { - auto &Vec = Base.second; - if (Vec.size() > 1) { - llvm::stable_sort(Vec, [](const std::tuple &X, - const std::tuple &Y) { - return std::get<1>(X) < std::get<1>(Y); - }); - int InitialOffset = std::get<1>(Vec[0]); - AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) { - return std::get<1>(P.value()) == int(P.index()) + InitialOffset; - }); - } - } + if (Bases.size() == VL.size()) + return false; - // Fill SortedIndices array only if it looks worth-while to sort the ptrs. 
- SortedIndices.clear(); - if (!AnyConsecutive) + if (Bases.size() == 1 && (Bases.front().second.size() == 1 || + Bases.front().second.size() == VL.size())) return false; - // If we have a better order, also sort the base pointers by increasing - // (variable) values if possible, to try and keep the order more regular. In - // order to create a valid strict-weak order we cluster by the Root of gep - // chains and sort within each. - SmallVector> SortedBases; + // For each of the bases sort the pointers by Offset and check if any of the + // base become consecutively allocated. + auto ComparePointers = [](Value *Ptr1, Value *Ptr2) { + SmallPtrSet FirstPointers; + SmallPtrSet SecondPointers; + Value *P1 = Ptr1; + Value *P2 = Ptr2; + if (P1 == P2) + return false; + unsigned Depth = 0; + while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) && + Depth <= RecursionMaxDepth) { + FirstPointers.insert(P1); + SecondPointers.insert(P2); + P1 = getUnderlyingObject(P1, /*MaxLookup=*/1); + P2 = getUnderlyingObject(P2, /*MaxLookup=*/1); + ++Depth; + } + assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) && + "Unable to find matching root."); + return FirstPointers.contains(P2) && !SecondPointers.contains(P1); + }; for (auto &Base : Bases) { - Value *Strip = Base.first->stripInBoundsConstantOffsets(); - Value *Root = Strip; - while (auto *Gep = dyn_cast(Root)) - Root = Gep->getOperand(0); - SortedBases.emplace_back(Base.first, Strip, Root); - } - auto *Begin = SortedBases.begin(); - auto *End = SortedBases.end(); - while (Begin != End) { - Value *Root = std::get<2>(*Begin); - auto *Mid = std::stable_partition( - Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; }); - DenseMap> LessThan; - for (auto *I = Begin; I < Mid; ++I) - LessThan.try_emplace(std::get<1>(*I)); - for (auto *I = Begin; I < Mid; ++I) { - Value *V = std::get<1>(*I); - while (auto *Gep = dyn_cast(V)) { - V = Gep->getOperand(0); - if (LessThan.contains(V)) - LessThan[V][std::get<1>(*I)] = true; - } - } - std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) { - return LessThan[std::get<1>(V1)][std::get<1>(V2)]; + for (auto &Vec : Base.second) { + if (Vec.size() > 1) { + stable_sort(Vec, [](const std::tuple &X, + const std::tuple &Y) { + return std::get<1>(X) < std::get<1>(Y); + }); + int InitialOffset = std::get<1>(Vec[0]); + bool AnyConsecutive = + all_of(enumerate(Vec), [InitialOffset](const auto &P) { + return std::get<1>(P.value()) == int(P.index()) + InitialOffset; + }); + // Fill SortedIndices array only if it looks worth-while to sort the + // ptrs. 
+ if (!AnyConsecutive) + return false; + } + } + stable_sort(Base.second, [&](const auto &V1, const auto &V2) { + return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front())); }); - Begin = Mid; } - // Collect the final order of sorted indices - for (auto Base : SortedBases) - for (auto &T : Bases[std::get<0>(Base)]) - SortedIndices.push_back(std::get<2>(T)); + for (auto &T : Bases) + for (const auto &Vec : T.second) + for (const auto &P : Vec) + SortedIndices.push_back(std::get<2>(P)); assert(SortedIndices.size() == VL.size() && "Expected SortedIndices to be the size of VL"); @@ -5286,15 +5302,19 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { SmallVector Ptrs; Ptrs.reserve(TE.Scalars.size()); + SmallVector BBs; + BBs.reserve(TE.Scalars.size()); for (Value *V : TE.Scalars) { auto *L = dyn_cast(V); if (!L || !L->isSimple()) return std::nullopt; Ptrs.push_back(L->getPointerOperand()); + BBs.push_back(L->getParent()); } BoUpSLP::OrdersType Order; - if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order)) + if (!LoadEntriesToVectorize.contains(TE.Idx) && + clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order)) return std::move(Order); return std::nullopt; } @@ -5662,7 +5682,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars // has been auditted for correctness with non-power-of-two vectors. - if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) + if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) return CurrentOrder; } @@ -6393,13 +6413,15 @@ void BoUpSLP::buildExternalUses( } } -DenseMap> +SmallVector> BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { - DenseMap> PtrToStoresMap; + SmallDenseMap, + SmallVector, 8> + PtrToStoresMap; for (unsigned Lane : seq(0, TE->Scalars.size())) { Value *V = TE->Scalars[Lane]; // Don't iterate over the users of constant data. - if (isa(V)) + if (!isa(V)) continue; // To save compilation time we don't visit if we have too many users. if (V->hasNUsesOrMore(UsesLimit)) @@ -6417,25 +6439,34 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { if (getTreeEntry(U)) continue; - Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); - auto &StoresVec = PtrToStoresMap[Ptr]; + Value *Ptr = + getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth); + auto &StoresVec = PtrToStoresMap[{SI->getParent(), + SI->getValueOperand()->getType(), Ptr}]; // For now just keep one store per pointer object per lane. // TODO: Extend this to support multiple stores per pointer per lane if (StoresVec.size() > Lane) continue; - // Skip if in different BBs. - if (!StoresVec.empty() && - SI->getParent() != StoresVec.back()->getParent()) - continue; - // Make sure that the stores are of the same type. - if (!StoresVec.empty() && - SI->getValueOperand()->getType() != - StoresVec.back()->getValueOperand()->getType()) - continue; + if (!StoresVec.empty()) { + std::optional Diff = getPointersDiff( + SI->getValueOperand()->getType(), SI->getPointerOperand(), + SI->getValueOperand()->getType(), + StoresVec.front()->getPointerOperand(), *DL, *SE, + /*StrictCheck=*/true); + // We failed to compare the pointers so just abandon this store. 
+ if (!Diff) + continue; + } StoresVec.push_back(SI); } } - return PtrToStoresMap; + SmallVector> Res(PtrToStoresMap.size()); + unsigned I = 0; + for (auto &P : PtrToStoresMap) { + Res[I].swap(P.second); + ++I; + } + return Res; } bool BoUpSLP::canFormVector(ArrayRef StoresVec, @@ -6445,9 +6476,9 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, // To avoid calling getPointersDiff() while sorting we create a vector of // pairs {store, offset from first} and sort this instead. - SmallVector> StoreOffsetVec(StoresVec.size()); + SmallVector> StoreOffsetVec; StoreInst *S0 = StoresVec[0]; - StoreOffsetVec[0] = {S0, 0}; + StoreOffsetVec.emplace_back(0, 0); Type *S0Ty = S0->getValueOperand()->getType(); Value *S0Ptr = S0->getPointerOperand(); for (unsigned Idx : seq(1, StoresVec.size())) { @@ -6456,41 +6487,36 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); - // We failed to compare the pointers so just abandon this StoresVec. - if (!Diff) - return false; - StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff}; + StoreOffsetVec.emplace_back(*Diff, Idx); } - // Sort the vector based on the pointers. We create a copy because we may - // need the original later for calculating the reorder (shuffle) indices. - stable_sort(StoreOffsetVec, [](const std::pair &Pair1, - const std::pair &Pair2) { - int Offset1 = Pair1.second; - int Offset2 = Pair2.second; - return Offset1 < Offset2; - }); - // Check if the stores are consecutive by checking if their difference is 1. - for (unsigned Idx : seq(1, StoreOffsetVec.size())) - if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1) + if (StoreOffsetVec.size() != StoresVec.size()) + return false; + sort(StoreOffsetVec, + [](const std::pair &L, + const std::pair &R) { return L.first < R.first; }); + unsigned Idx = 0; + int PrevDist = 0; + for (const auto &P : StoreOffsetVec) { + if (Idx > 0 && P.first != PrevDist + 1) return false; + PrevDist = P.first; + ++Idx; + } // Calculate the shuffle indices according to their offset against the sorted // StoreOffsetVec. - ReorderIndices.reserve(StoresVec.size()); - for (StoreInst *SI : StoresVec) { - unsigned Idx = find_if(StoreOffsetVec, - [SI](const std::pair &Pair) { - return Pair.first == SI; - }) - - StoreOffsetVec.begin(); - ReorderIndices.push_back(Idx); + ReorderIndices.assign(StoresVec.size(), 0); + bool IsIdentity = true; + for (auto [I, P] : enumerate(StoreOffsetVec)) { + ReorderIndices[P.second] = I; + IsIdentity &= P.second == I; } // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in // reorderTopToBottom() and reorderBottomToTop(), so we are following the // same convention here. - if (isIdentityOrder(ReorderIndices)) + if (IsIdentity) ReorderIndices.clear(); return true; @@ -6508,8 +6534,7 @@ SmallVector BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { unsigned NumLanes = TE->Scalars.size(); - DenseMap> PtrToStoresMap = - collectUserStores(TE); + SmallVector> Stores = collectUserStores(TE); // Holds the reorder indices for each candidate store vector that is a user of // the current TreeEntry. @@ -6518,8 +6543,7 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { // Now inspect the stores collected per pointer and look for vectorization // candidates. 
For each candidate calculate the reorder index vector and push // it into `ExternalReorderIndices` - for (const auto &Pair : PtrToStoresMap) { - auto &StoresVec = Pair.second; + for (ArrayRef StoresVec : Stores) { // If we have fewer than NumLanes stores, then we can't form a vector. if (StoresVec.size() != NumLanes) continue; @@ -6574,9 +6598,13 @@ static void gatherPossiblyVectorizableLoads( continue; bool IsFound = false; for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) { - if (LI->getParent() != Data.front().first->getParent() || - LI->getType() != Data.front().first->getType()) - continue; + assert(LI->getParent() == Data.front().first->getParent() && + LI->getType() == Data.front().first->getType() && + getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) == + getUnderlyingObject(Data.front().first->getPointerOperand(), + RecursionMaxDepth) && + "Expected loads with the same type, same parent and same " + "underlying pointer."); std::optional Dist = getPointersDiff( LI->getType(), LI->getPointerOperand(), Data.front().first->getType(), Data.front().first->getPointerOperand(), DL, SE, @@ -6704,7 +6732,9 @@ static void gatherPossiblyVectorizableLoads( } void BoUpSLP::tryToVectorizeGatheredLoads( - ArrayRef>> GatheredLoads) { + const SmallMapVector, + SmallVector>>, + 8> &GatheredLoads) { GatheredLoadsEntriesFirst = VectorizableTree.size(); SmallVector> LoadSetsToVectorize( @@ -6737,7 +6767,10 @@ void BoUpSLP::tryToVectorizeGatheredLoads( SmallVector CandidateVFs; if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1)) CandidateVFs.push_back(MaxVF); - for (int NumElts = bit_floor(MaxVF); NumElts > 1; NumElts /= 2) { + for (int NumElts = getFloorFullVectorNumberOfElements( + *TTI, Loads.front()->getType(), MaxVF); + NumElts > 1; NumElts = getFloorFullVectorNumberOfElements( + *TTI, Loads.front()->getType(), NumElts - 1)) { CandidateVFs.push_back(NumElts); if (VectorizeNonPowerOf2 && NumElts > 2) CandidateVFs.push_back(NumElts - 1); @@ -6751,9 +6784,10 @@ void BoUpSLP::tryToVectorizeGatheredLoads( if (Final && NumElts > BestVF) continue; SmallVector MaskedGatherVectorized; - for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E; + for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) { - ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); + ArrayRef Slice = + ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt)); if (VectorizedLoads.count(Slice.front()) || VectorizedLoads.count(Slice.back()) || areKnownNonVectorizableLoads(Slice)) @@ -7099,24 +7133,27 @@ void BoUpSLP::tryToVectorizeGatheredLoads( } return NonVectorized; }; - SmallVector NonVectorized = ProcessGatheredLoads(GatheredLoads); - if (!GatheredLoads.empty() && !NonVectorized.empty() && - std::accumulate( - GatheredLoads.begin(), GatheredLoads.end(), 0u, - [](unsigned S, ArrayRef> LoadsDists) { - return S + LoadsDists.size(); - }) != NonVectorized.size() && - IsMaskedGatherSupported(NonVectorized)) { - SmallVector>> FinalGatheredLoads; - for (LoadInst *LI : NonVectorized) { - // Reinsert non-vectorized loads to other list of loads with the same - // base pointers. - gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, - FinalGatheredLoads, - /*AddNew=*/false); - } - // Final attempt to vectorize non-vectorized loads. 
-  (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
+  for (const auto &GLs : GatheredLoads) {
+    const auto &Ref = GLs.second;
+    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
+    if (!Ref.empty() && !NonVectorized.empty() &&
+        std::accumulate(
+            Ref.begin(), Ref.end(), 0u,
+            [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
+              return S + LoadsDists.size();
+            }) != NonVectorized.size() &&
+        IsMaskedGatherSupported(NonVectorized)) {
+      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
+      for (LoadInst *LI : NonVectorized) {
+        // Reinsert non-vectorized loads to other list of loads with the same
+        // base pointers.
+        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
+                                        FinalGatheredLoads,
+                                        /*AddNew=*/false);
+      }
+      // Final attempt to vectorize non-vectorized loads.
+      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
+    }
+  }
   // Try to vectorize postponed load entries, previously marked as gathered.
   for (unsigned Idx : LoadEntriesToVectorize) {
@@ -7363,13 +7400,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {
   assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
-  if (S.MainOp->getType()->isFloatingPointTy() &&
-      TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
-        auto *I = dyn_cast<Instruction>(V);
-        return I && (I->isBinaryOp() || isa<CallInst>(I)) && !I->isFast();
-      }))
-    return TreeEntry::NeedToGather;
-
   unsigned ShuffleOrOp = S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                                           : S.getOpcode();
   auto *VL0 = cast<Instruction>(S.OpValue);
@@ -7534,6 +7564,12 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::Or:
   case Instruction::Xor:
   case Instruction::Freeze:
+    if (S.MainOp->getType()->isFloatingPointTy() &&
+        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
+          auto *I = dyn_cast<Instruction>(V);
+          return I && I->isBinaryOp() && !I->isFast();
+        }))
+      return TreeEntry::NeedToGather;
     return TreeEntry::Vectorize;
   case Instruction::GetElementPtr: {
     // We don't combine GEPs with complicated (nested) indexing.
@@ -7625,6 +7661,12 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     return TreeEntry::NeedToGather;
   }
   case Instruction::Call: {
+    if (S.MainOp->getType()->isFloatingPointTy() &&
+        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
+          auto *I = dyn_cast<Instruction>(V);
+          return I && !I->isFast();
+        }))
+      return TreeEntry::NeedToGather;
     // Check if the calls are all to the same vectorizable intrinsic or
     // library function.
     CallInst *CI = cast<CallInst>(VL0);
@@ -9344,8 +9386,13 @@ void BoUpSLP::transformNodes() {
       // insertvector instructions.
       unsigned StartIdx = 0;
       unsigned End = VL.size();
-      for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) {
-        SmallVector<unsigned> Slices;
+      for (unsigned VF = getFloorFullVectorNumberOfElements(
+               *TTI, VL.front()->getType(), VL.size() - 1);
+           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
+                            *TTI, VL.front()->getType(), VF - 1)) {
+        if (StartIdx + VF > End)
+          continue;
+        SmallVector<std::pair<unsigned, unsigned>> Slices;
         for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
           ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
           // If any instruction is vectorized already - do not try again.
@@ -9375,7 +9422,10 @@ void BoUpSLP::transformNodes() {
           if (IsSplat)
             continue;
           InstructionsState S = getSameOpcode(Slice, *TLI);
-          if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice))
+          if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) ||
+              (S.getOpcode() == Instruction::Load &&
+               areKnownNonVectorizableLoads(Slice)) ||
+              (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
             continue;
           if (VF == 2) {
             // Try to vectorize reduced values or if all users are vectorized.
@@ -9395,8 +9445,16 @@
                   canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
               // Do not vectorize gathers.
               if (Res == LoadsState::ScatterVectorize ||
-                  Res == LoadsState::Gather)
+                  Res == LoadsState::Gather) {
+                if (Res == LoadsState::Gather) {
+                  registerNonVectorizableLoads(Slice);
+                  // If reductions and the scalars from the root node are
+                  // analyzed - mark as non-vectorizable reduction.
+                  if (UserIgnoreList && E.Idx == 0)
+                    analyzedReductionVals(Slice);
+                }
                 continue;
+              }
             } else if (S.getOpcode() == Instruction::ExtractElement ||
                        (TTI->getInstructionCost(
                             cast<Instruction>(Slice.front()), CostKind) <
@@ -9411,17 +9469,17 @@ void BoUpSLP::transformNodes() {
               }
             }
           }
-          Slices.emplace_back(Cnt);
+          Slices.emplace_back(Cnt, Slice.size());
         }
-        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) {
+        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
           E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
           if (StartIdx == Cnt)
-            StartIdx = Cnt + VF;
-          if (End == Cnt + VF)
+            StartIdx = Cnt + Sz;
+          if (End == Cnt + Sz)
             End = Cnt;
         };
-        for (unsigned Cnt : Slices) {
-          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+        for (auto [Cnt, Sz] : Slices) {
+          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
           // If any instruction is vectorized already - do not try again.
           if (TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
@@ -9430,7 +9488,7 @@
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
            SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
-            AddCombinedNode(SE->Idx, Cnt);
+            AddCombinedNode(SE->Idx, Cnt, Sz);
            continue;
          }
          unsigned PrevSize = VectorizableTree.size();
@@ -9442,12 +9500,14 @@
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
+            if (UserIgnoreList && E.Idx == 0 && VF == 2)
+              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
-          AddCombinedNode(PrevSize, Cnt);
+          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    }
@@ -9542,11 +9602,24 @@ void BoUpSLP::transformNodes() {
          VectorizableTree.front()->Scalars.size() == SmallVF) ||
         (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;
+
+    if (VectorizableTree.front()->isNonPowOf2Vec() &&
+        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
+        getCanonicalGraphSize() <= SmallTree &&
+        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
+                 [](const std::unique_ptr<TreeEntry> &TE) {
+                   return TE->isGather() &&
+                          TE->getOpcode() == Instruction::Load &&
+                          !allSameBlock(TE->Scalars);
+                 }) == 1)
+      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
-  SmallVector<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads;
+  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
+                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
+      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
@@ -9558,9 +9631,21 @@ void BoUpSLP::transformNodes() {
                 !isVectorized(V) &&
                 !isDeleted(cast<Instruction>(V));
        }))) &&
-        !isSplat(E.Scalars))
-      gatherPossiblyVectorizableLoads(*this, E.Scalars, *DL, *SE, *TTI,
-                                      GatheredLoads);
+        !isSplat(E.Scalars)) {
+      for (Value *V : E.Scalars) {
+        auto *LI = dyn_cast<LoadInst>(V);
+        if (!LI)
+          continue;
+        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
+          continue;
+        gatherPossiblyVectorizableLoads(
+            *this, V, *DL, *SE, *TTI,
+            GatheredLoads[std::make_tuple(
+                LI->getParent(),
+                getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
+                LI->getType())]);
+      }
+    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
@@ -11515,6 +11600,34 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  return true;
}

+bool BoUpSLP::isTreeNotExtendable() const {
+  if (getCanonicalGraphSize() != getTreeSize()) {
+    constexpr unsigned SmallTree = 3;
+    if (VectorizableTree.front()->isNonPowOf2Vec() &&
+        getCanonicalGraphSize() <= SmallTree &&
+        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
+                 [](const std::unique_ptr<TreeEntry> &TE) {
+                   return TE->isGather() &&
+                          TE->getOpcode() == Instruction::Load &&
+                          !allSameBlock(TE->Scalars);
+                 }) == 1)
+      return true;
+    return false;
+  }
+  bool Res = false;
+  for (unsigned Idx : seq<unsigned>(getTreeSize())) {
+    TreeEntry &E = *VectorizableTree[Idx];
+    if (!E.isGather())
+      continue;
+    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
+      return false;
+    if (isSplat(E.Scalars) || allConstant(E.Scalars))
+      continue;
+    Res = true;
+  }
+  return Res;
+}
+
 InstructionCost BoUpSLP::getSpillCost() const {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
@@ -18771,7 +18884,8 @@ public:
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Key = hash_combine(hash_value(LI->getParent()), Key);
-      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
+      Value *Ptr =
+          getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
@@ -19094,8 +19208,28 @@ public:
                             RegMaxNumber * RedValsMaxNumber);
    unsigned ReduxWidth = NumReducedVals;
+    auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
+      unsigned NumParts, NumRegs;
+      Type *ScalarTy = Candidates.front()->getType();
+      ReduxWidth =
+          getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
+      VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
+      NumParts = TTI.getNumberOfParts(Tp);
+      NumRegs =
+          TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
+      while (NumParts > NumRegs) {
+        ReduxWidth = bit_floor(ReduxWidth - 1);
+        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
+        NumParts = TTI.getNumberOfParts(Tp);
+        NumRegs =
+            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
+      }
+      if (NumParts > NumRegs / 2)
+        ReduxWidth = bit_floor(ReduxWidth);
+      return ReduxWidth;
+    };
    if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
-      ReduxWidth = bit_floor(ReduxWidth);
+      ReduxWidth = GetVectorFactor(ReduxWidth);
    ReduxWidth = std::min(ReduxWidth, MaxElts);

    unsigned Start = 0;
@@ -19103,10 +19237,7 @@ public:
    // Restarts vectorization attempt with lower vector factor.
    unsigned PrevReduxWidth = ReduxWidth;
    bool CheckForReusedReductionOpsLocal = false;
-    auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
-                                &CheckForReusedReductionOpsLocal,
-                                &PrevReduxWidth, &V,
-                                &IgnoreList](bool IgnoreVL = false) {
+    auto AdjustReducedVals = [&](bool IgnoreVL = false) {
      bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
      if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
        // Check if any of the reduction ops are gathered. If so, worth
@@ -19117,10 +19248,13 @@ public:
      if (Pos < NumReducedVals - ReduxWidth + 1)
        return IsAnyRedOpGathered;
      Pos = Start;
-      ReduxWidth = bit_ceil(ReduxWidth) / 2;
+      --ReduxWidth;
+      if (ReduxWidth > 1)
+        ReduxWidth = GetVectorFactor(ReduxWidth);
      return IsAnyRedOpGathered;
    };
    bool AnyVectorized = false;
+    SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
    while (Pos < NumReducedVals - ReduxWidth + 1 &&
           ReduxWidth >= ReductionLimit) {
      // Dependency in tree of the reduction ops - drop this attempt, try
@@ -19132,8 +19266,15 @@ public:
      }
      PrevReduxWidth = ReduxWidth;
      ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
-      // Beeing analyzed already - skip.
-      if (V.areAnalyzedReductionVals(VL)) {
+      // Been analyzed already - skip.
+      if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
+          (!has_single_bit(ReduxWidth) &&
+           (IgnoredCandidates.contains(
+                std::make_pair(Pos, bit_floor(ReduxWidth))) ||
+            IgnoredCandidates.contains(
+                std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
+                               bit_floor(ReduxWidth))))) ||
+          V.areAnalyzedReductionVals(VL)) {
        (void)AdjustReducedVals(/*IgnoreVL=*/true);
        continue;
      }
@@ -19239,8 +19380,24 @@ public:
                 << " and threshold "
                 << ore::NV("Threshold", -SLPCostThreshold);
        });
-        if (!AdjustReducedVals())
+        if (!AdjustReducedVals()) {
          V.analyzedReductionVals(VL);
+          unsigned Offset = Pos == Start ? Pos : Pos - 1;
+          if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
+            // Add subvectors of VL to the list of the analyzed values.
+            for (unsigned VF = getFloorFullVectorNumberOfElements(
+                     *TTI, VL.front()->getType(), ReduxWidth - 1);
+                 VF >= ReductionLimit;
+                 VF = getFloorFullVectorNumberOfElements(
+                     *TTI, VL.front()->getType(), VF - 1)) {
+              if (has_single_bit(VF) &&
+                  V.getCanonicalGraphSize() != V.getTreeSize())
+                continue;
+              for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
+                IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
+            }
+          }
+        }
        continue;
      }
@@ -19349,7 +19506,9 @@ public:
      }
      Pos += ReduxWidth;
      Start = Pos;
-      ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
+      ReduxWidth = NumReducedVals - Pos;
+      if (ReduxWidth > 1)
+        ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
      AnyVectorized = true;
    }
    if (OptReusedScalars && !AnyVectorized) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
index fffa626cae0d..c431b058f0d2 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -17,18 +17,17 @@
 define void @s116_modified(ptr %a) {
 ; CHECK-LABEL: @s116_modified(
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
+; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3
 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1
 ; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2)
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32>
 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4
 ; CHECK-NEXT: ret void
 ;
 %gep1 = getelementptr inbounds float, ptr %a, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
index 833bc56c4ec6..2191d04cd797 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
@@ -7,8 +7,7 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) {
 ; NON-POWER-OF-2-NEXT: entry:
 ; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
 ; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
-; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32>
-; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32>
+; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0)
 ; NON-POWER-OF-2-NEXT: [[TMP5:%.*]]
= call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POWER-OF-2-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index 757d0b1708b6..234b65803238 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -11,19 +11,21 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0) ; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] -; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4 ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: entry.if.end72_crit_edge: ; CHECK-NEXT: br label [[IF_END72:%.*]] @@ -46,8 +48,7 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP26]], ptr [[RC21]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 72e29839230e..c9ff2d6426d2 
100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -318,22 +318,14 @@ entry: define float @f(ptr nocapture readonly %x) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) ; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) -; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: @@ -606,18 +598,14 @@ define float @loadadd31(ptr nocapture readonly %x) { ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float 
[[OP_RDX1]], [[TMP3]] @@ -627,18 +615,14 @@ define float @loadadd31(ptr nocapture readonly %x) { ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 -; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) -; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) ; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index a7201e776fb4..0bc91d42b0f1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1013,11 +1013,11 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: -; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 -; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4) -; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0) -; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2) +; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0) +; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP4]], i64 4) +; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP2]], i64 6) ; THRESH-NEXT: [[TMP8:%.*]] = call i32 
@llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) ; THRESH-NEXT: ret i32 [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll index 47dd84c7f6e9..4898111960c0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll @@ -7,10 +7,11 @@ define void @e(ptr %c, i64 %0) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C]], align 8 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 112 -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 104 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX5]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP18]], <2 x ptr> poison, <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2 @@ -18,7 +19,7 @@ define void @e(ptr %c, i64 %0) { ; CHECK-NEXT: [[TMP9:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP8]], <2 x ptr> [[TMP4]], i64 0) ; CHECK-NEXT: [[TMP10:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP9]], <2 x ptr> [[TMP6]], i64 4) ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i64> [[TMP13]], <32 x i64> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i64> [[TMP14]], [[TMP12]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll index fd3c1a57aff3..a821362a883a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll @@ -7,8 +7,7 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 ; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> +; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0) ; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POW2-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll index e1b091cc6fcd..9719e60a6a69 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -8,18 +8,18 @@ define void @test(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 2 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[RESULT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] -- GitLab From dca43a1c82f1023127343daae487c3a6a8c7e3d4 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 21 Oct 2024 15:45:51 -0400 Subject: [PATCH 278/511] [lld/Macho][test] Mark objc-category-merging-minimal.s as unsupported on Windows (#113209) With #112981, the test uses awk, which gnuwin32 doesn't seem to have. --- lld/test/MachO/objc-category-merging-minimal.s | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lld/test/MachO/objc-category-merging-minimal.s b/lld/test/MachO/objc-category-merging-minimal.s index 88c175333f26..d4d5933aa5ec 100644 --- a/lld/test/MachO/objc-category-merging-minimal.s +++ b/lld/test/MachO/objc-category-merging-minimal.s @@ -1,4 +1,7 @@ # REQUIRES: aarch64 +# UNSUPPORTED: system-windows +# due to awk usage + # RUN: rm -rf %t; split-file %s %t && cd %t ############ Test merging multiple categories into a single category ############ -- GitLab From 009fb567ceb9a8afea3c13b5eb943a1f15fdf3b5 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 21 Oct 2024 21:06:48 +0100 Subject: [PATCH 279/511] [AArch64] Add patterns for combining qxtn+rshr to qrshrn Similar to bd861d0e690cfd05184d86, this adds some patterns for converting signed and unsigned variants of rshr+qxtn to qrshrn. 
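For illustration, a hedged C sketch of the kind of source pattern these patterns collapse (the function name and shift amount are made up for the example, mirroring the NarrowAShrI32By5 test added below): a rounding shift right feeding a saturating narrow.

#include <arm_neon.h>

// vrshrq_n_s32 is a signed rounding shift right (srshr #5) and vqmovn_s32 is
// the signed saturating narrow (sqxtn); with the new patterns the pair can be
// selected as a single instruction: sqrshrn v0.4h, v0.4s, #5.
int16x4_t narrow_round_shift(int32x4_t x) {
  return vqmovn_s32(vrshrq_n_s32(x, 5));
}

The unsigned (uqrshrn) and signed-to-unsigned (sqrshrun) variants follow the same shape, e.g. vqmovn_u32 with vrshrq_n_u32, or vqmovun_s32 with vrshrq_n_s32.
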
--- .../Target/AArch64/AArch64ISelLowering.cpp | 23 ++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 12 +- llvm/test/CodeGen/AArch64/rqshrn.ll | 371 ++++++++++++++++++ 3 files changed, 400 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/rqshrn.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bf2f0674b5b6..4aa123b42d19 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5971,6 +5971,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2))); return SDValue(); + case Intrinsic::aarch64_neon_sqrshrn: + if (Op.getValueType().isVector()) + return DAG.getNode( + ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(), + DAG.getNode( + AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(), + Op.getOperand(1), Op.getOperand(2))); + return SDValue(); + case Intrinsic::aarch64_neon_sqrshrun: + if (Op.getValueType().isVector()) + return DAG.getNode( + ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(), + DAG.getNode( + AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(), + Op.getOperand(1), Op.getOperand(2))); + return SDValue(); + case Intrinsic::aarch64_neon_uqrshrn: + if (Op.getValueType().isVector()) + return DAG.getNode( + ISD::TRUNCATE_USAT_U, dl, Op.getValueType(), + DAG.getNode( + AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2))); + return SDValue(); case Intrinsic::aarch64_sve_whilelo: return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false, /*IsEqual=*/false); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 1eb93066cfd8..4bd36e9eacbc 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -8001,15 +8001,15 @@ def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", - int_aarch64_neon_sqrshrn>; + BinOpFrag<(truncssat_s (AArch64srshri node:$LHS, node:$RHS))>>; defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", - int_aarch64_neon_sqrshrun>; + BinOpFrag<(truncssat_u (AArch64srshri node:$LHS, node:$RHS))>>; defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", - BinOpFrag<(truncssat_s (AArch64vashr node:$LHS, node:$RHS))>>; + BinOpFrag<(truncssat_s (AArch64vashr node:$LHS, node:$RHS))>>; defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", - BinOpFrag<(truncssat_u (AArch64vashr node:$LHS, node:$RHS))>>; + BinOpFrag<(truncssat_u (AArch64vashr node:$LHS, node:$RHS))>>; defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>; def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), @@ -8027,10 +8027,10 @@ defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", - int_aarch64_neon_uqrshrn>; + BinOpFrag<(truncusat_u (AArch64urshri node:$LHS, node:$RHS))>>; defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", - 
BinOpFrag<(truncusat_u (AArch64vlshr node:$LHS, node:$RHS))>>; + BinOpFrag<(truncusat_u (AArch64vlshr node:$LHS, node:$RHS))>>; defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", TriOpFrag<(add node:$LHS, diff --git a/llvm/test/CodeGen/AArch64/rqshrn.ll b/llvm/test/CodeGen/AArch64/rqshrn.ll new file mode 100644 index 000000000000..e7522f5cd3ab --- /dev/null +++ b/llvm/test/CodeGen/AArch64/rqshrn.ll @@ -0,0 +1,371 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version -5 +; RUN: llc %s -mtriple=aarch64 -o - | FileCheck %s + +define <4 x i16> @NarrowAShrI32By5(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrI32By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sqrshrn v0.4h, v0.4s, #5 +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowAShrU32By5(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrU32By5: +; CHECK: // %bb.0: +; CHECK-NEXT: srshr v0.4s, v0.4s, #5 +; CHECK-NEXT: uqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowAShrI32By5ToU16(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrI32By5ToU16: +; CHECK: // %bb.0: +; CHECK-NEXT: sqrshrun v0.4h, v0.4s, #5 +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowLShrI32By5(<4 x i32> %x) { +; CHECK-LABEL: NarrowLShrI32By5: +; CHECK: // %bb.0: +; CHECK-NEXT: urshr v0.4s, v0.4s, #5 +; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowLShrU32By5(<4 x i32> %x) { +; CHECK-LABEL: NarrowLShrU32By5: +; CHECK: // %bb.0: +; CHECK-NEXT: uqrshrn v0.4h, v0.4s, #5 +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowLShrI32By5ToU16(<4 x i32> %x) { +; CHECK-LABEL: NarrowLShrI32By5ToU16: +; CHECK: // %bb.0: +; CHECK-NEXT: urshr v0.4s, v0.4s, #5 +; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + + +define <2 x i32> @NarrowAShri64By5(<2 x i64> %x) { +; CHECK-LABEL: NarrowAShri64By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sqrshrn v0.2s, v0.2d, #5 +; CHECK-NEXT: ret + %s = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %x, <2 x i64> ) + %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowAShrU64By5(<2 x i64> %x) { +; CHECK-LABEL: NarrowAShrU64By5: +; CHECK: // %bb.0: +; CHECK-NEXT: srshr v0.2d, v0.2d, #5 +; CHECK-NEXT: uqxtn v0.2s, v0.2d +; CHECK-NEXT: ret + %s = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %x, <2 x i64> ) + %r = tail call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> 
@NarrowAShri64By5ToU32(<2 x i64> %x) { +; CHECK-LABEL: NarrowAShri64By5ToU32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqrshrun v0.2s, v0.2d, #5 +; CHECK-NEXT: ret + %s = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %x, <2 x i64> ) + %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowLShri64By5(<2 x i64> %x) { +; CHECK-LABEL: NarrowLShri64By5: +; CHECK: // %bb.0: +; CHECK-NEXT: urshr v0.2d, v0.2d, #5 +; CHECK-NEXT: sqxtn v0.2s, v0.2d +; CHECK-NEXT: ret + %s = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %x, <2 x i64> ) + %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowLShrU64By5(<2 x i64> %x) { +; CHECK-LABEL: NarrowLShrU64By5: +; CHECK: // %bb.0: +; CHECK-NEXT: uqrshrn v0.2s, v0.2d, #5 +; CHECK-NEXT: ret + %s = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %x, <2 x i64> ) + %r = tail call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + +define <2 x i32> @NarrowLShri64By5ToU32(<2 x i64> %x) { +; CHECK-LABEL: NarrowLShri64By5ToU32: +; CHECK: // %bb.0: +; CHECK-NEXT: urshr v0.2d, v0.2d, #5 +; CHECK-NEXT: sqxtun v0.2s, v0.2d +; CHECK-NEXT: ret + %s = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %x, <2 x i64> ) + %r = tail call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %s) + ret <2 x i32> %r +} + + +define <8 x i8> @NarrowAShri16By5(<8 x i16> %x) { +; CHECK-LABEL: NarrowAShri16By5: +; CHECK: // %bb.0: +; CHECK-NEXT: sqrshrn v0.8b, v0.8h, #5 +; CHECK-NEXT: ret + %s = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %x, <8 x i16> ) + %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowAShrU16By5(<8 x i16> %x) { +; CHECK-LABEL: NarrowAShrU16By5: +; CHECK: // %bb.0: +; CHECK-NEXT: srshr v0.8h, v0.8h, #5 +; CHECK-NEXT: uqxtn v0.8b, v0.8h +; CHECK-NEXT: ret + %s = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %x, <8 x i16> ) + %r = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowAShri16By5ToU8(<8 x i16> %x) { +; CHECK-LABEL: NarrowAShri16By5ToU8: +; CHECK: // %bb.0: +; CHECK-NEXT: sqrshrun v0.8b, v0.8h, #5 +; CHECK-NEXT: ret + %s = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %x, <8 x i16> ) + %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowLShri16By5(<8 x i16> %x) { +; CHECK-LABEL: NarrowLShri16By5: +; CHECK: // %bb.0: +; CHECK-NEXT: urshr v0.8h, v0.8h, #5 +; CHECK-NEXT: sqxtn v0.8b, v0.8h +; CHECK-NEXT: ret + %s = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %x, <8 x i16> ) + %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowLShrU16By5(<8 x i16> %x) { +; CHECK-LABEL: NarrowLShrU16By5: +; CHECK: // %bb.0: +; CHECK-NEXT: uqrshrn v0.8b, v0.8h, #5 +; CHECK-NEXT: ret + %s = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %x, <8 x i16> ) + %r = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %s) + ret <8 x i8> %r +} + +define <8 x i8> @NarrowLShri16By5ToU8(<8 x i16> %x) { +; CHECK-LABEL: NarrowLShri16By5ToU8: +; CHECK: // %bb.0: +; CHECK-NEXT: urshr v0.8h, v0.8h, #5 +; CHECK-NEXT: sqxtun v0.8b, v0.8h +; CHECK-NEXT: ret + %s = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %x, <8 x i16> ) + %r = tail call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %s) + ret <8 x 
i8> %r +} + + + + + +define <4 x i16> @NarrowAShrI32By31(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrI32By31: +; CHECK: // %bb.0: +; CHECK-NEXT: sqrshrn v0.4h, v0.4s, #16 +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.srshl(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowAShrI32By31ToU16(<4 x i32> %x) { +; CHECK-LABEL: NarrowAShrI32By31ToU16: +; CHECK: // %bb.0: +; CHECK-NEXT: sqrshrun v0.4h, v0.4s, #16 +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.srshl(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + +define <4 x i16> @NarrowLShrU32By31(<4 x i32> %x) { +; CHECK-LABEL: NarrowLShrU32By31: +; CHECK: // %bb.0: +; CHECK-NEXT: uqrshrn v0.4h, v0.4s, #16 +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.aarch64.neon.urshl(<4 x i32> %x, <4 x i32> ) + %r = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %s) + ret <4 x i16> %r +} + + +define <16 x i8> @signed_minmax_v8i16_to_v16i8(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: signed_minmax_v8i16_to_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrshrn v0.8b, v0.8h, #5 +; CHECK-NEXT: sqrshrn2 v0.16b, v1.8h, #5 +; CHECK-NEXT: ret +entry: + %l = call <8 x i16> @llvm.aarch64.neon.srshl(<8 x i16> %x, <8 x i16> ) + %h = call <8 x i16> @llvm.aarch64.neon.srshl(<8 x i16> %y, <8 x i16> ) + %s = shufflevector <8 x i16> %l, <8 x i16> %h, <16 x i32> + %min = call <16 x i16> @llvm.smin.v8i16(<16 x i16> %s, <16 x i16> ) + %max = call <16 x i16> @llvm.smax.v8i16(<16 x i16> %min, <16 x i16> ) + %trunc = trunc <16 x i16> %max to <16 x i8> + ret <16 x i8> %trunc +} + +define <16 x i8> @unsigned_minmax_v8i16_to_v16i8(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: unsigned_minmax_v8i16_to_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uqrshrn v0.8b, v0.8h, #5 +; CHECK-NEXT: uqrshrn2 v0.16b, v1.8h, #5 +; CHECK-NEXT: ret +entry: + %l = call <8 x i16> @llvm.aarch64.neon.urshl(<8 x i16> %x, <8 x i16> ) + %h = call <8 x i16> @llvm.aarch64.neon.urshl(<8 x i16> %y, <8 x i16> ) + %s = shufflevector <8 x i16> %l, <8 x i16> %h, <16 x i32> + %min = call <16 x i16> @llvm.umin.v8i16(<16 x i16> %s, <16 x i16> ) + %trunc = trunc <16 x i16> %min to <16 x i8> + ret <16 x i8> %trunc +} + +define <16 x i8> @unsigned_signed_minmax_v8i16_to_v16i8(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: unsigned_signed_minmax_v8i16_to_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrshrun v0.8b, v0.8h, #5 +; CHECK-NEXT: sqrshrun2 v0.16b, v1.8h, #5 +; CHECK-NEXT: ret +entry: + %l = call <8 x i16> @llvm.aarch64.neon.srshl(<8 x i16> %x, <8 x i16> ) + %h = call <8 x i16> @llvm.aarch64.neon.srshl(<8 x i16> %y, <8 x i16> ) + %s = shufflevector <8 x i16> %l, <8 x i16> %h, <16 x i32> + %max = call <16 x i16> @llvm.smax.v8i16(<16 x i16> %s, <16 x i16> ) + %min = call <16 x i16> @llvm.umin.v8i16(<16 x i16> %max, <16 x i16> ) + %trunc = trunc <16 x i16> %min to <16 x i8> + ret <16 x i8> %trunc +} + + +define <8 x i16> @signed_minmax_v4i32_to_v8i16(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: signed_minmax_v4i32_to_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrshrn v0.4h, v0.4s, #5 +; CHECK-NEXT: sqrshrn2 v0.8h, v1.4s, #5 +; CHECK-NEXT: ret +entry: + %l = call <4 x i32> @llvm.aarch64.neon.srshl(<4 x i32> %x, <4 x i32> ) + %h = call <4 x i32> @llvm.aarch64.neon.srshl(<4 x i32> %y, <4 x i32> ) + %s = shufflevector <4 x i32> %l, <4 x i32> %h, <8 x i32> + %min = call <8 x i32> 
@llvm.smin.v8i32(<8 x i32> %s, <8 x i32> ) + %max = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %min, <8 x i32> ) + %trunc = trunc <8 x i32> %max to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @unsigned_minmax_v4i32_to_v8i16(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: unsigned_minmax_v4i32_to_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uqrshrn v0.4h, v0.4s, #5 +; CHECK-NEXT: uqrshrn2 v0.8h, v1.4s, #5 +; CHECK-NEXT: ret +entry: + %l = call <4 x i32> @llvm.aarch64.neon.urshl(<4 x i32> %x, <4 x i32> ) + %h = call <4 x i32> @llvm.aarch64.neon.urshl(<4 x i32> %y, <4 x i32> ) + %s = shufflevector <4 x i32> %l, <4 x i32> %h, <8 x i32> + %min = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %s, <8 x i32> ) + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @unsigned_signed_minmax_v4i32_to_v8i16(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: unsigned_signed_minmax_v4i32_to_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrshrun v0.4h, v0.4s, #5 +; CHECK-NEXT: sqrshrun2 v0.8h, v1.4s, #5 +; CHECK-NEXT: ret +entry: + %l = call <4 x i32> @llvm.aarch64.neon.srshl(<4 x i32> %x, <4 x i32> ) + %h = call <4 x i32> @llvm.aarch64.neon.srshl(<4 x i32> %y, <4 x i32> ) + %s = shufflevector <4 x i32> %l, <4 x i32> %h, <8 x i32> + %max = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %s, <8 x i32> ) + %min = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %max, <8 x i32> ) + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + + +define <4 x i32> @signed_minmax_v4i64_to_v8i32(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: signed_minmax_v4i64_to_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrshrn v0.2s, v0.2d, #5 +; CHECK-NEXT: sqrshrn2 v0.4s, v1.2d, #5 +; CHECK-NEXT: ret +entry: + %l = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %x, <2 x i64> ) + %h = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %y, <2 x i64> ) + %s = shufflevector <2 x i64> %l, <2 x i64> %h, <4 x i32> + %min = call <4 x i64> @llvm.smin.v8i64(<4 x i64> %s, <4 x i64> ) + %max = call <4 x i64> @llvm.smax.v8i64(<4 x i64> %min, <4 x i64> ) + %trunc = trunc <4 x i64> %max to <4 x i32> + ret <4 x i32> %trunc +} + +define <4 x i32> @unsigned_minmax_v4i64_to_v8i32(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: unsigned_minmax_v4i64_to_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uqrshrn v0.2s, v0.2d, #5 +; CHECK-NEXT: uqrshrn2 v0.4s, v1.2d, #5 +; CHECK-NEXT: ret +entry: + %l = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %x, <2 x i64> ) + %h = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %y, <2 x i64> ) + %s = shufflevector <2 x i64> %l, <2 x i64> %h, <4 x i32> + %min = call <4 x i64> @llvm.umin.v8i64(<4 x i64> %s, <4 x i64> ) + %trunc = trunc <4 x i64> %min to <4 x i32> + ret <4 x i32> %trunc +} + +define <4 x i32> @unsigned_signed_minmax_v4i64_to_v8i32(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: unsigned_signed_minmax_v4i64_to_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrshrun v0.2s, v0.2d, #5 +; CHECK-NEXT: sqrshrun2 v0.4s, v1.2d, #5 +; CHECK-NEXT: ret +entry: + %l = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %x, <2 x i64> ) + %h = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %y, <2 x i64> ) + %s = shufflevector <2 x i64> %l, <2 x i64> %h, <4 x i32> + %max = call <4 x i64> @llvm.smax.v8i64(<4 x i64> %s, <4 x i64> ) + %min = call <4 x i64> @llvm.umin.v8i64(<4 x i64> %max, <4 x i64> ) + %trunc = trunc <4 x i64> %min to <4 x i32> + ret <4 x i32> %trunc +} -- GitLab From 
3903cb4695012fb85a76c83b5616f2ffe6fa10f4 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Mon, 21 Oct 2024 13:19:21 -0700 Subject: [PATCH 280/511] [bazel] Use rules_python load statements (#113213) With bazel 8.x these are strongly encouraged, and this disambiguates which version of these rules we get for older versions. Specifically the native.py_test was using the wrong version of py_test. --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 1 + utils/bazel/llvm-project-overlay/lldb/BUILD.bazel | 1 + utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 + utils/bazel/llvm-project-overlay/llvm/lit_test.bzl | 4 ++-- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index d52dd4870f16..1facb31bb3f4 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -2,6 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +load("@rules_python//python:defs.bzl", "py_binary") load( "//:vars.bzl", "LLVM_VERSION_MAJOR", diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel index 3ed4f552290d..9dba2efc34f6 100644 --- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel @@ -6,6 +6,7 @@ load("@bazel_skylib//lib:selects.bzl", "selects") load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") load("@bazel_skylib//rules:expand_template.bzl", "expand_template") load("@build_bazel_apple_support//rules:apple_genrule.bzl", "apple_genrule") +load("@rules_python//python:defs.bzl", "py_binary") load("//:vars.bzl", "LLVM_VERSION_MAJOR", "LLVM_VERSION_MINOR", "LLVM_VERSION_PATCH", "LLVM_VERSION_SUFFIX", "PACKAGE_VERSION") load("//lldb/source/Plugins:plugin_config.bzl", "DEFAULT_PLUGINS", "DEFAULT_SCRIPT_PLUGINS", "OBJCPP_COPTS") load("//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index cea80ce57027..af381ac378f8 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -4,6 +4,7 @@ load("@bazel_skylib//rules:common_settings.bzl", "string_flag") load("@bazel_skylib//rules:expand_template.bzl", "expand_template") +load("@rules_python//python:defs.bzl", "py_binary") load("//mlir:tblgen.bzl", "td_library") load(":binary_alias.bzl", "binary_alias") load(":config.bzl", "llvm_config_defines") diff --git a/utils/bazel/llvm-project-overlay/llvm/lit_test.bzl b/utils/bazel/llvm-project-overlay/llvm/lit_test.bzl index af7ae560768d..0af12a635855 100644 --- a/utils/bazel/llvm-project-overlay/llvm/lit_test.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/lit_test.bzl @@ -4,6 +4,7 @@ """Rules for running lit tests.""" load("@bazel_skylib//lib:paths.bzl", "paths") +load("@rules_python//python:defs.bzl", "py_test") def lit_test( name, @@ -29,8 +30,7 @@ def lit_test( args = args or [] data = data or [] deps = deps or [] - - native.py_test( + py_test( name = name, srcs = [Label("//llvm:lit")], main = Label("//llvm:utils/lit/lit.py"), -- GitLab From 6e1a7ac53163c335868d5773b1b35b55828f329c Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Mon, 21 Oct 2024 13:34:11 -0700 Subject: [PATCH 281/511] [llvm][x64] Mark win x64 SEH pseudo instruction as meta instructions (again) 
(#112962) When adding new SEH pseudo instructions in #110024 I noticed that some of the tests were changing their output since these new instructions were counting towards thresholds for branching versus folding decisions. These instructions do not result in real machine instructions being emitted, so they should be marked as meta instructions. This is a re-do of #110889 as we hit an issue where some of the SEH pseudo instructions in the prolog were being duplicated, which resulted in errors being raised as the CodeView generator was seeing prolog directives after an end-prolog directive. The fix for this is to mark the prolog related SEH pseudo instructions as being non-duplicatable. --- llvm/lib/Target/X86/X86InstrCompiler.td | 12 +- llvm/test/CodeGen/X86/no-dup-cv-directive.ll | 119 ++++++++++++++++++ .../CodeGen/X86/x86-win64-shrink-wrapping.ll | 24 ++-- 3 files changed, 142 insertions(+), 13 deletions(-) create mode 100644 llvm/test/CodeGen/X86/no-dup-cv-directive.ll diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 5a8177e2b360..a05c3f028442 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -235,7 +235,13 @@ let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { //===----------------------------------------------------------------------===// // Pseudo instructions used by unwind info. // -let isPseudo = 1, SchedRW = [WriteSystem] in { + +// Prolog instructions should not be duplicated, since this can cause issues +// because 1) if only some of the instructions are duplicated, then we will +// observe prolog instructions after the end-prolog instruction and 2) Windows +// expects there to only be a single prolog (e.g., when checking if unwinding +// is happening in the middle of a prolog). +let isPseudo = 1, isMeta = 1, isNotDuplicable = 1, SchedRW = [WriteSystem] in { def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), "#SEH_PushReg $reg", []>; def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), @@ -252,6 +258,10 @@ let isPseudo = 1, SchedRW = [WriteSystem] in { "#SEH_PushFrame $mode", []>; def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), "#SEH_EndPrologue", []>; +} + +// Epilog instructions: +let isPseudo = 1, isMeta = 1, SchedRW = [WriteSystem] in { def SEH_Epilogue : I<0, Pseudo, (outs), (ins), "#SEH_Epilogue", []>; } diff --git a/llvm/test/CodeGen/X86/no-dup-cv-directive.ll b/llvm/test/CodeGen/X86/no-dup-cv-directive.ll new file mode 100644 index 000000000000..98ee9cff10d1 --- /dev/null +++ b/llvm/test/CodeGen/X86/no-dup-cv-directive.ll @@ -0,0 +1,119 @@ +; RUN: llc -O3 < %s | FileCheck %s + +; Regression test for https://github.com/llvm/llvm-project/pull/110889#issuecomment-2393405613 +; Marking x64 SEH instructions as meta led to cv directives being duplicated, which caused +; `cv_fpo_stackalloc` to be observed after seeing a `cv_fpo_endprologue`, which is an error. + +; Generated from the following code: +; int q; +; class b { +; public: +; b(); +; }; +; struct G { +; char n[sizeof(void *)]; +; int *i; +; int p() const { return n[0] ?
*i : 1; } +; int s() const; +; }; +; int G::s() const { +; q = p(); +; b(); +; } +; To reproduce: clang -target i686-w64-mingw32 -w -c repro.cpp -O3 -g -gcodeview -emit-llvm + +; CHECK-LABEL: __ZNK1G1sEv: +; CHECK: .cv_fpo_proc __ZNK1G1sEv 0 +; CHECK: .cv_fpo_stackalloc 4 +; CHECK: .cv_fpo_endprologue +; CHECK-NOT: .cv_fpo_stackalloc +; CHECK-NOT: .cv_fpo_endprologue + +target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-w64-windows-gnu" + +%class.b = type { i8 } + +@q = dso_local local_unnamed_addr global i32 0, align 4, !dbg !0 + +; Function Attrs: mustprogress noreturn +define dso_local x86_thiscallcc noundef i32 @_ZNK1G1sEv(ptr nocapture noundef nonnull readonly align 4 dereferenceable(8) %this) local_unnamed_addr #0 align 2 !dbg !13 { +entry: + %agg.tmp.ensured = alloca %class.b, align 1 + #dbg_value(ptr %this, !30, !DIExpression(), !32) + #dbg_value(ptr %this, !33, !DIExpression(), !36) + %0 = load i8, ptr %this, align 4, !dbg !38, !tbaa !39 + %tobool.not.i = icmp eq i8 %0, 0, !dbg !38 + br i1 %tobool.not.i, label %_ZNK1G1pEv.exit, label %cond.true.i, !dbg !38 + +cond.true.i: ; preds = %entry + %i.i = getelementptr inbounds nuw i8, ptr %this, i32 4, !dbg !38 + %1 = load ptr, ptr %i.i, align 4, !dbg !38, !tbaa !42 + %2 = load i32, ptr %1, align 4, !dbg !38, !tbaa !45 + br label %_ZNK1G1pEv.exit, !dbg !38 + +_ZNK1G1pEv.exit: ; preds = %entry, %cond.true.i + %cond.i = phi i32 [ %2, %cond.true.i ], [ 1, %entry ], !dbg !38 + store i32 %cond.i, ptr @q, align 4, !dbg !47, !tbaa !45 + call x86_thiscallcc void @_ZN1bC1Ev(ptr noundef nonnull align 1 dereferenceable(1) %agg.tmp.ensured), !dbg !48 + unreachable, !dbg !48 +} + +declare dso_local x86_thiscallcc void @_ZN1bC1Ev(ptr noundef nonnull align 1 dereferenceable(1)) unnamed_addr #1 + +attributes #0 = { mustprogress noreturn "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!6, !7, !8, !9, !10, !11} +!llvm.ident = !{!12} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "q", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 20.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "repro.cpp", directory: "C:\\llvm", checksumkind: CSK_MD5, checksum: "54362b0cc0bf4b9927aafc8b00498049") +!4 = !{!0} +!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!6 = !{i32 1, !"NumRegisterParameters", i32 0} +!7 = !{i32 2, !"CodeView", i32 1} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{i32 1, !"wchar_size", i32 2} +!10 = !{i32 1, !"MaxTLSAlign", i32 65536} +!11 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!12 = !{!"clang version 20.0.0"} +!13 = distinct !DISubprogram(name: "s", linkageName: "_ZNK1G1sEv", scope: !14, file: !3, line: 12, type: !24, scopeLine: 12, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, declaration: !28, retainedNodes: !29) +!14 = 
distinct !DICompositeType(tag: DW_TAG_structure_type, name: "G", file: !3, line: 6, size: 64, flags: DIFlagTypePassByValue, elements: !15, identifier: "_ZTS1G") +!15 = !{!16, !21, !23, !28} +!16 = !DIDerivedType(tag: DW_TAG_member, name: "n", scope: !14, file: !3, line: 7, baseType: !17, size: 32) +!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, size: 32, elements: !19) +!18 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!19 = !{!20} +!20 = !DISubrange(count: 4) +!21 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !14, file: !3, line: 8, baseType: !22, size: 32, offset: 32) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32) +!23 = !DISubprogram(name: "p", linkageName: "_ZNK1G1pEv", scope: !14, file: !3, line: 9, type: !24, scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!24 = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: !25) +!25 = !{!5, !26} +!26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !27, size: 32, flags: DIFlagArtificial | DIFlagObjectPointer) +!27 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !14) +!28 = !DISubprogram(name: "s", linkageName: "_ZNK1G1sEv", scope: !14, file: !3, line: 10, type: !24, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!29 = !{!30} +!30 = !DILocalVariable(name: "this", arg: 1, scope: !13, type: !31, flags: DIFlagArtificial | DIFlagObjectPointer) +!31 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !27, size: 32) +!32 = !DILocation(line: 0, scope: !13) +!33 = !DILocalVariable(name: "this", arg: 1, scope: !34, type: !31, flags: DIFlagArtificial | DIFlagObjectPointer) +!34 = distinct !DISubprogram(name: "p", linkageName: "_ZNK1G1pEv", scope: !14, file: !3, line: 9, type: !24, scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, declaration: !23, retainedNodes: !35) +!35 = !{!33} +!36 = !DILocation(line: 0, scope: !34, inlinedAt: !37) +!37 = distinct !DILocation(line: 13, scope: !13) +!38 = !DILocation(line: 9, scope: !34, inlinedAt: !37) +!39 = !{!40, !40, i64 0} +!40 = !{!"omnipotent char", !41, i64 0} +!41 = !{!"Simple C++ TBAA"} +!42 = !{!43, !44, i64 4} +!43 = !{!"_ZTS1G", !40, i64 0, !44, i64 4} +!44 = !{!"any pointer", !40, i64 0} +!45 = !{!46, !46, i64 0} +!46 = !{!"int", !40, i64 0} +!47 = !DILocation(line: 13, scope: !13) +!48 = !DILocation(line: 14, scope: !13) diff --git a/llvm/test/CodeGen/X86/x86-win64-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-win64-shrink-wrapping.ll index d52990e753d3..8309593896bf 100644 --- a/llvm/test/CodeGen/X86/x86-win64-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-win64-shrink-wrapping.ll @@ -17,7 +17,7 @@ define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) #0 { ; ENABLE-NEXT: .seh_pushreg %rbx ; ENABLE-NEXT: .seh_endprologue ; ENABLE-NEXT: testl %ecx, %ecx -; ENABLE-NEXT: je .LBB0_4 +; ENABLE-NEXT: je .LBB0_5 ; ENABLE-NEXT: # %bb.1: # %for.preheader ; ENABLE-NEXT: #APP ; ENABLE-NEXT: nop @@ -38,11 +38,11 @@ define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) #0 { ; ENABLE-NEXT: nop ; ENABLE-NEXT: #NO_APP ; ENABLE-NEXT: shll $3, %eax -; ENABLE-NEXT: jmp .LBB0_5 -; ENABLE-NEXT: .LBB0_4: # %if.else +; ENABLE-NEXT: popq %rbx +; ENABLE-NEXT: retq +; ENABLE-NEXT: .LBB0_5: # %if.else ; ENABLE-NEXT: movl %edx, %eax ; ENABLE-NEXT: addl %edx, %eax -; ENABLE-NEXT: .LBB0_5: # %if.end ; ENABLE-NEXT: popq %rbx ; ENABLE-NEXT: retq ; ENABLE-NEXT: .seh_endproc @@ -53,7 +53,7 @@ define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) #0 { ; 
DISABLE-NEXT: .seh_pushreg %rbx ; DISABLE-NEXT: .seh_endprologue ; DISABLE-NEXT: testl %ecx, %ecx -; DISABLE-NEXT: je .LBB0_4 +; DISABLE-NEXT: je .LBB0_5 ; DISABLE-NEXT: # %bb.1: # %for.preheader ; DISABLE-NEXT: #APP ; DISABLE-NEXT: nop @@ -74,11 +74,11 @@ define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) #0 { ; DISABLE-NEXT: nop ; DISABLE-NEXT: #NO_APP ; DISABLE-NEXT: shll $3, %eax -; DISABLE-NEXT: jmp .LBB0_5 -; DISABLE-NEXT: .LBB0_4: # %if.else +; DISABLE-NEXT: popq %rbx +; DISABLE-NEXT: retq +; DISABLE-NEXT: .LBB0_5: # %if.else ; DISABLE-NEXT: movl %edx, %eax ; DISABLE-NEXT: addl %edx, %eax -; DISABLE-NEXT: .LBB0_5: # %if.end ; DISABLE-NEXT: popq %rbx ; DISABLE-NEXT: retq ; DISABLE-NEXT: .seh_endproc @@ -157,7 +157,7 @@ define i32 @loopInfoSaveOutsideLoop2(i32 %cond, i32 %N) #0 { ; DISABLE-NEXT: .seh_pushreg %rbx ; DISABLE-NEXT: .seh_endprologue ; DISABLE-NEXT: testl %ecx, %ecx -; DISABLE-NEXT: je .LBB1_4 +; DISABLE-NEXT: je .LBB1_5 ; DISABLE-NEXT: # %bb.1: # %for.preheader ; DISABLE-NEXT: #APP ; DISABLE-NEXT: nop @@ -178,11 +178,11 @@ define i32 @loopInfoSaveOutsideLoop2(i32 %cond, i32 %N) #0 { ; DISABLE-NEXT: nop ; DISABLE-NEXT: #NO_APP ; DISABLE-NEXT: shll $3, %eax -; DISABLE-NEXT: jmp .LBB1_5 -; DISABLE-NEXT: .LBB1_4: # %if.else +; DISABLE-NEXT: popq %rbx +; DISABLE-NEXT: retq +; DISABLE-NEXT: .LBB1_5: # %if.else ; DISABLE-NEXT: addl %edx, %edx ; DISABLE-NEXT: movl %edx, %eax -; DISABLE-NEXT: .LBB1_5: # %if.end ; DISABLE-NEXT: popq %rbx ; DISABLE-NEXT: retq ; DISABLE-NEXT: .seh_endproc -- GitLab From b6e9ba017f222b2f95237d69126281d6252bf176 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Mon, 21 Oct 2024 21:40:57 +0100 Subject: [PATCH 282/511] [FMV][AArch64] Unify features memtag and memtag2. (#112511) If we split these features in the compiler (see relevant pull request https://github.com/llvm/llvm-project/pull/109299), we would only be able to hand-write a 'memtag2' version using inline assembly since the compiler cannot generate the instructions that become available with FEAT_MTE2. However these instructions only work at Exception Level 1, so they would be unusable since FMV is a user space facility. I am therefore unifying them. Approved in ACLE as https://github.com/ARM-software/acle/pull/351 --- clang/include/clang/Basic/AttrDocs.td | 2 +- clang/lib/Basic/Targets/AArch64.cpp | 2 +- .../CodeGen/aarch64-cpu-supports-target.c | 2 +- clang/test/CodeGen/aarch64-cpu-supports.c | 15 +++++++++----- clang/test/CodeGen/aarch64-fmv-dependencies.c | 9 +++------ .../test/CodeGen/attr-target-clones-aarch64.c | 20 +++++++++---------- clang/test/CodeGen/attr-target-version.c | 16 +++++++-------- clang/test/Sema/attr-target-clones-aarch64.c | 2 +- .../builtins/cpu_model/AArch64CPUFeatures.inc | 2 +- .../builtins/cpu_model/aarch64/fmv/mrs.inc | 4 +--- .../llvm/TargetParser/AArch64CPUFeatures.inc | 2 +- llvm/lib/Target/AArch64/AArch64FMV.td | 3 +-- 12 files changed, 39 insertions(+), 40 deletions(-) diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index b1512e22ee2d..ee8126cadae2 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2669,7 +2669,7 @@ sign. For example: .. 
code-block:: c++ - __attribute__((target_clones("sha2+memtag2", "fcma+sve2-pmull128"))) + __attribute__((target_clones("sha2+memtag", "fcma+sve2-pmull128"))) void foo() {} For every multiversioned function a ``default`` (fallback) implementation diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index b96fab978a3f..3dbba2b4d25b 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -784,7 +784,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("sme-fa64", HasSMEFA64) .Case("sme-f16f16", HasSMEF16F16) .Case("sme-b16b16", HasSMEB16B16) - .Cases("memtag", "memtag2", HasMTE) + .Case("memtag", HasMTE) .Case("sb", HasSB) .Case("predres", HasPredRes) .Cases("ssbs", "ssbs2", HasSSBS) diff --git a/clang/test/CodeGen/aarch64-cpu-supports-target.c b/clang/test/CodeGen/aarch64-cpu-supports-target.c index 28187bcf7453..5186cab92a92 100644 --- a/clang/test/CodeGen/aarch64-cpu-supports-target.c +++ b/clang/test/CodeGen/aarch64-cpu-supports-target.c @@ -17,7 +17,7 @@ int check_all_feature() { return 7; else if (__builtin_cpu_supports("sve2-bitperm+sve2-sha3+sve2-sm4")) return 8; - else if (__builtin_cpu_supports("sme+memtag+memtag2+memtag3+sb")) + else if (__builtin_cpu_supports("sme+memtag+memtag3+sb")) return 9; else if (__builtin_cpu_supports("predres+ssbs+ssbs2+bti+ls64+ls64_v")) return 10; diff --git a/clang/test/CodeGen/aarch64-cpu-supports.c b/clang/test/CodeGen/aarch64-cpu-supports.c index 823bf369df6f..dc96c929fdf4 100644 --- a/clang/test/CodeGen/aarch64-cpu-supports.c +++ b/clang/test/CodeGen/aarch64-cpu-supports.c @@ -1,9 +1,10 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 2 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --global-value-regex ".*" // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s +//. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK-LABEL: define dso_local i32 @main -// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +//. +// CHECK-LABEL: @main( // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -17,8 +18,8 @@ // CHECK-NEXT: br label [[RETURN:%.*]] // CHECK: if.end: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 9070970929152 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 9070970929152 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 17867063951360 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 17867063951360 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[IF_THEN1:%.*]], label [[IF_END2:%.*]] // CHECK: if.then1: @@ -60,3 +61,7 @@ int main(void) { return 0; } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
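For illustration (a hedged sketch, not part of this patch, assuming an AArch64 target with function multiversioning enabled): after this unification, user code selects the MTE-dependent version with the single feature name `memtag`, as the updated clang tests in this patch exercise:

```
// Hedged sketch; the function names are illustrative, the feature
// spellings are the ones used in the clang tests touched by this patch.
__attribute__((target_clones("memtag", "default")))
int tagged_path(void) { return 1; }

int has_mte(void) {
  // After this change, "memtag" corresponds to FEAT_MTE2 (+mte) at
  // runtime; there is no separate "memtag2" name anymore.
  return __builtin_cpu_supports("memtag");
}
```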
diff --git a/clang/test/CodeGen/aarch64-fmv-dependencies.c b/clang/test/CodeGen/aarch64-fmv-dependencies.c index f4229a5d2339..9aca1b7a9daf 100644 --- a/clang/test/CodeGen/aarch64-fmv-dependencies.c +++ b/clang/test/CodeGen/aarch64-fmv-dependencies.c @@ -72,13 +72,10 @@ __attribute__((target_version("ls64"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mlse() #[[lse:[0-9]+]] { __attribute__((target_version("lse"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mmemtag() #[[ATTR0:[0-9]+]] { +// CHECK: define dso_local i32 @fmv._Mmemtag() #[[memtag:[0-9]+]] { __attribute__((target_version("memtag"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mmemtag2() #[[memtag2:[0-9]+]] { -__attribute__((target_version("memtag2"))) int fmv(void) { return 0; } - -// CHECK: define dso_local i32 @fmv._Mmemtag3() #[[memtag2:[0-9]+]] { +// CHECK: define dso_local i32 @fmv._Mmemtag3() #[[memtag:[0-9]+]] { __attribute__((target_version("memtag3"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mmops() #[[mops:[0-9]+]] { @@ -200,7 +197,7 @@ int caller() { // CHECK: attributes #[[jscvt]] = { {{.*}} "target-features"="+fp-armv8,+jsconv,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[ls64]] = { {{.*}} "target-features"="+fp-armv8,+ls64,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[lse]] = { {{.*}} "target-features"="+fp-armv8,+lse,+neon,+outline-atomics,+v8a" -// CHECK: attributes #[[memtag2]] = { {{.*}} "target-features"="+fp-armv8,+mte,+neon,+outline-atomics,+v8a" +// CHECK: attributes #[[memtag]] = { {{.*}} "target-features"="+fp-armv8,+mte,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[mops]] = { {{.*}} "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[pmull]] = { {{.*}} "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[predres]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+predres,+v8a" diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index 292e544139e3..c490682e310a 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -4,10 +4,10 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +mte -target-feature +bti -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-MTE-BTI int __attribute__((target_clones("lse+aes", "sve2"))) ftc(void) { return 0; } -int __attribute__((target_clones("sha2", "sha2+memtag2", " default "))) ftc_def(void) { return 1; } +int __attribute__((target_clones("sha2", "sha2+memtag", " default "))) ftc_def(void) { return 1; } int __attribute__((target_clones("sha2", "default"))) ftc_dup1(void) { return 2; } int __attribute__((target_clones("fp", "crc+dotprod"))) ftc_dup2(void) { return 3; } -int __attribute__((target_clones("memtag2", "bti"))) ftc_dup3(void) { return 4; } +int __attribute__((target_clones("memtag", "bti"))) ftc_dup3(void) { return 4; } int foo() { return ftc() + ftc_def() + ftc_dup1() + ftc_dup2() + ftc_dup3(); } @@ -90,7 +90,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@ftc_def._Mmemtag2Msha2 +// CHECK-LABEL: define {{[^@]+}}@ftc_def._MmemtagMsha2 // CHECK-SAME: () #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 @@ -105,7 +105,7 @@ inline int __attribute__((target_clones("fp16", 
"sve2-bitperm+fcma", "default")) // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @ftc_def._Mmemtag2Msha2 +// CHECK-NEXT: ret ptr @ftc_def._MmemtagMsha2 // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4096 @@ -176,7 +176,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@ftc_dup3._Mmemtag2 +// CHECK-LABEL: define {{[^@]+}}@ftc_dup3._Mmemtag // CHECK-SAME: () #[[ATTR6:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 @@ -206,7 +206,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @ftc_dup3._Mmemtag2 +// CHECK-NEXT: ret ptr @ftc_dup3._Mmemtag // CHECK: resolver_else2: // CHECK-NEXT: ret ptr @ftc_dup3.default // @@ -547,7 +547,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def._Mmemtag2Msha2 +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def._MmemtagMsha2 // CHECK-MTE-BTI-SAME: () #[[ATTR2]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 1 @@ -562,7 +562,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-MTE-BTI-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-MTE-BTI-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK-MTE-BTI: resolver_return: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_def._Mmemtag2Msha2 +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_def._MmemtagMsha2 // CHECK-MTE-BTI: resolver_else: // CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4096 @@ -633,7 +633,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3._Mmemtag2 +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3._Mmemtag // CHECK-MTE-BTI-SAME: () #[[ATTR5:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 4 @@ -663,7 +663,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK-MTE-BTI: resolver_return1: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_dup3._Mmemtag2 +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_dup3._Mmemtag // CHECK-MTE-BTI: resolver_else2: // CHECK-MTE-BTI-NEXT: ret ptr @ftc_dup3.default // diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index 22a53c82bfbf..0e2c7ad99d81 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -34,7 +34,7 @@ inline int __attribute__((target_version("rcpc+frintts"))) fmv_inline(void) { re inline int __attribute__((target_version("sve+sve-bf16"))) fmv_inline(void) { return 4; } inline int 
__attribute__((target_version("sve2-aes+sve2-sha3"))) fmv_inline(void) { return 5; } inline int __attribute__((target_version("sve2+sve2-pmull128+sve2-bitperm"))) fmv_inline(void) { return 9; } -inline int __attribute__((target_version("sve2-sm4+memtag2"))) fmv_inline(void) { return 10; } +inline int __attribute__((target_version("sve2-sm4+memtag"))) fmv_inline(void) { return 10; } inline int __attribute__((target_version("memtag3+rcpc3+mops"))) fmv_inline(void) { return 11; } inline int __attribute__((target_version("aes+dotprod"))) fmv_inline(void) { return 13; } inline int __attribute__((target_version("simd+fp16fml"))) fmv_inline(void) { return 14; } @@ -500,8 +500,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv._McrcMls64 // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 8796093022216 -// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 8796093022216 +// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 17592186044424 +// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 17592186044424 // CHECK-NEXT: [[TMP19:%.*]] = and i1 true, [[TMP18]] // CHECK-NEXT: br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]] // CHECK: resolver_return7: @@ -729,7 +729,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag2Msve2-sm4 +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MmemtagMsve2-sm4 // CHECK-SAME: () #[[ATTR34:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 10 @@ -751,21 +751,21 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mfp16fmlMsimd -// CHECK-SAME: () #[[ATTR4]] { +// CHECK-SAME: () #[[ATTR36:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 14 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfpMsm4 -// CHECK-SAME: () #[[ATTR36:[0-9]+]] { +// CHECK-SAME: () #[[ATTR37:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 15 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MlseMrdm -// CHECK-SAME: () #[[ATTR37:[0-9]+]] { +// CHECK-SAME: () #[[ATTR38:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 16 // @@ -826,7 +826,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: [[TMP23:%.*]] = and i1 true, [[TMP22]] // CHECK-NEXT: br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]] // CHECK: resolver_return9: -// CHECK-NEXT: ret ptr @fmv_inline._Mmemtag2Msve2-sm4 +// CHECK-NEXT: ret ptr @fmv_inline._MmemtagMsve2-sm4 // CHECK: resolver_else10: // CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 1236950581248 diff --git a/clang/test/Sema/attr-target-clones-aarch64.c b/clang/test/Sema/attr-target-clones-aarch64.c index 2765c06c68fb..191ca9ba9650 100644 --- a/clang/test/Sema/attr-target-clones-aarch64.c +++ b/clang/test/Sema/attr-target-clones-aarch64.c @@ -22,7 +22,7 @@ int __attribute__((target_clones("rng", "fp16fml+fp", "default"))) redecl4(void) // expected-error@+3 {{'target_clones' attribute does not match previous declaration}} // expected-note@-2 
{{previous declaration is here}} // expected-warning@+1 {{version list contains entries that don't impact code generation}} -int __attribute__((target_clones("dgh+memtag+rpres", "ebf16+dpb", "default"))) redecl4(void) { return 1; } +int __attribute__((target_clones("dgh+rpres", "ebf16+dpb", "default"))) redecl4(void) { return 1; } int __attribute__((target_version("flagm2"))) redef2(void) { return 1; } // expected-error@+2 {{multiversioned function redeclarations require identical target attributes}} diff --git a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc index bb1875fe9f72..7af7f42517d1 100644 --- a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc +++ b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc @@ -65,7 +65,7 @@ enum CPUFeatures { FEAT_SVE_SHA3, FEAT_SVE_SM4, FEAT_SME, - FEAT_MEMTAG, + RESERVED_FEAT_MEMTAG, // previously used and now ABI legacy FEAT_MEMTAG2, FEAT_MEMTAG3, FEAT_SB, diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc index a9befd7f3e56..0f1e9dd85303 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -45,10 +45,8 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_SB); if (hwcap & HWCAP_SSBS) setCPUFeature(FEAT_SSBS2); - if (hwcap2 & HWCAP2_MTE) { - setCPUFeature(FEAT_MEMTAG); + if (hwcap2 & HWCAP2_MTE) setCPUFeature(FEAT_MEMTAG2); - } if (hwcap2 & HWCAP2_MTE3) setCPUFeature(FEAT_MEMTAG3); if (hwcap2 & HWCAP2_SVEAES) diff --git a/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc b/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc index bb1875fe9f72..7af7f42517d1 100644 --- a/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc +++ b/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc @@ -65,7 +65,7 @@ enum CPUFeatures { FEAT_SVE_SHA3, FEAT_SVE_SM4, FEAT_SME, - FEAT_MEMTAG, + RESERVED_FEAT_MEMTAG, // previously used and now ABI legacy FEAT_MEMTAG2, FEAT_MEMTAG3, FEAT_SB, diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td index c063d3f38ba1..5674e4dbd56b 100644 --- a/llvm/lib/Target/AArch64/AArch64FMV.td +++ b/llvm/lib/Target/AArch64/AArch64FMV.td @@ -60,8 +60,7 @@ def : FMVExtension<"i8mm", "FEAT_I8MM", "+i8mm", 270>; def : FMVExtension<"jscvt", "FEAT_JSCVT", "+fp-armv8,+neon,+jsconv", 210>; def : FMVExtension<"ls64", "FEAT_LS64_ACCDATA", "+ls64", 520>; def : FMVExtension<"lse", "FEAT_LSE", "+lse", 80>; -def : FMVExtension<"memtag", "FEAT_MEMTAG", "", 440>; -def : FMVExtension<"memtag2", "FEAT_MEMTAG2", "+mte", 450>; +def : FMVExtension<"memtag", "FEAT_MEMTAG2", "+mte", 440>; def : FMVExtension<"memtag3", "FEAT_MEMTAG3", "+mte", 460>; def : FMVExtension<"mops", "FEAT_MOPS", "+mops", 650>; def : FMVExtension<"pmull", "FEAT_PMULL", "+aes,+fp-armv8,+neon", 160>; -- GitLab From 34d4f660fe57132d17d2e37b72ccfc1d07269de9 Mon Sep 17 00:00:00 2001 From: lorenzo chelini Date: Mon, 21 Oct 2024 13:43:55 -0700 Subject: [PATCH 283/511] [mlir] Fix the emission of `prop-dict` when operations have no properties (#112851) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an operation has no properties, no property struct is emitted. To avoid a compilation error, we should also skip emitting `setPropertiesFromParsedAttr`, `parseProperties` and `printProperties` in such cases. 
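A minimal reproducer, mirroring the test operation this patch adds to `TestOps.td`, is an operation with an empty argument list (so TableGen emits no `Properties` struct) whose assembly format still uses the `prop-dict` directive:

```
// ODS definition from the test dialect (TEST_Op is the test op base):
// no arguments means no generated Properties struct, yet the assembly
// format still references prop-dict.
def TestOpWithEmptyProperties : TEST_Op<"empty_properties"> {
  let assemblyFormat = "prop-dict attr-dict";
  let arguments = (ins);
}
```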
Compilation error: ``` error: ‘Properties’ has not been declared static ::llvm::LogicalResult setPropertiesFromParsedAttr(Properties &prop, ::mlir::Attribute attr, ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError); ``` --- mlir/test/IR/properties.mlir | 4 +++ mlir/test/lib/Dialect/Test/TestOps.td | 5 ++++ mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 2 +- mlir/tools/mlir-tblgen/OpFormatGen.cpp | 31 +++++++++++---------- mlir/tools/mlir-tblgen/OpFormatGen.h | 3 +- 5 files changed, 29 insertions(+), 16 deletions(-) diff --git a/mlir/test/IR/properties.mlir b/mlir/test/IR/properties.mlir index 418b81dcbb03..9a1c49cb7dab 100644 --- a/mlir/test/IR/properties.mlir +++ b/mlir/test/IR/properties.mlir @@ -19,6 +19,10 @@ test.with_nice_properties "foo bar" is -3 // GENERIC-SAME: <{prop = "content for properties"}> : () -> () test.with_wrapped_properties <{prop = "content for properties"}> +// CHECK: test.empty_properties +// GENERIC: "test.empty_properties"() +test.empty_properties + // CHECK: test.using_property_in_custom // CHECK-SAME: [1, 4, 20]{{$}} // GENERIC: "test.using_property_in_custom"() diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 9e19966414d1..bc6c6cf213ea 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -3006,6 +3006,11 @@ def TestOpWithWrappedProperties : TEST_Op<"with_wrapped_properties"> { ); } +def TestOpWithEmptyProperties : TEST_Op<"empty_properties"> { + let assemblyFormat = "prop-dict attr-dict"; + let arguments = (ins); +} + def TestOpUsingPropertyInCustom : TEST_Op<"using_property_in_custom"> { let assemblyFormat = "custom($prop) attr-dict"; let arguments = (ins IntArrayProperty<"int64_t">:$prop); diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 71fa5011a476..dea6fb209863 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -1106,7 +1106,7 @@ OpEmitter::OpEmitter(const Operator &op, genFolderDecls(); genTypeInterfaceMethods(); genOpInterfaceMethods(); - generateOpFormat(op, opClass); + generateOpFormat(op, opClass, emitHelper.hasProperties()); genSideEffectInterfaceMethods(); } void OpEmitter::emitDecl( diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index c99c71572bec..3bf6f2f6d381 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -339,10 +339,8 @@ struct OperationFormat { Optional }; - OperationFormat(const Operator &op) - : useProperties(op.getDialect().usePropertiesForAttributes() && - !op.getAttributes().empty()), - opCppClassName(op.getCppClassName()) { + OperationFormat(const Operator &op, bool hasProperties) + : useProperties(hasProperties), opCppClassName(op.getCppClassName()) { operandTypes.resize(op.getNumOperands(), TypeResolution()); resultTypes.resize(op.getNumResults(), TypeResolution()); @@ -397,7 +395,7 @@ struct OperationFormat { /// A flag indicating if this operation has the SingleBlock trait. bool hasSingleBlockTrait; - /// Indicate whether attribute are stored in properties. + /// Indicate whether we need to use properties for the current operator. bool useProperties; /// Indicate whether prop-dict is used in the format @@ -1275,8 +1273,8 @@ static void genAttrParser(AttributeVariable *attr, MethodBody &body, // 'prop-dict' dictionary attr. 
static void genParsedAttrPropertiesSetter(OperationFormat &fmt, Operator &op, OpClass &opClass) { - // Not required unless 'prop-dict' is present. - if (!fmt.hasPropDict) + // Not required unless 'prop-dict' is present or we are not using properties. + if (!fmt.hasPropDict || !fmt.useProperties) return; SmallVector paramList; @@ -1621,8 +1619,10 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, body.unindent() << "}\n"; body.unindent(); } else if (isa(element)) { - body << " if (parseProperties(parser, result))\n" - << " return ::mlir::failure();\n"; + if (useProperties) { + body << " if (parseProperties(parser, result))\n" + << " return ::mlir::failure();\n"; + } } else if (auto *customDir = dyn_cast(element)) { genCustomDirectiveParser(customDir, body, useProperties, opCppClassName); } else if (isa(element)) { @@ -2047,9 +2047,11 @@ static void genPropDictPrinter(OperationFormat &fmt, Operator &op, } } - body << " _odsPrinter << \" \";\n" - << " printProperties(this->getContext(), _odsPrinter, " - "getProperties(), elidedProps);\n"; + if (fmt.useProperties) { + body << " _odsPrinter << \" \";\n" + << " printProperties(this->getContext(), _odsPrinter, " + "getProperties(), elidedProps);\n"; + } } /// Generate the printer for the 'attr-dict' directive. @@ -3771,7 +3773,8 @@ LogicalResult OpFormatParser::verifyOptionalGroupElement(SMLoc loc, // Interface //===----------------------------------------------------------------------===// -void mlir::tblgen::generateOpFormat(const Operator &constOp, OpClass &opClass) { +void mlir::tblgen::generateOpFormat(const Operator &constOp, OpClass &opClass, + bool hasProperties) { // TODO: Operator doesn't expose all necessary functionality via // the const interface. Operator &op = const_cast(constOp); @@ -3782,7 +3785,7 @@ void mlir::tblgen::generateOpFormat(const Operator &constOp, OpClass &opClass) { llvm::SourceMgr mgr; mgr.AddNewSourceBuffer( llvm::MemoryBuffer::getMemBuffer(op.getAssemblyFormat()), SMLoc()); - OperationFormat format(op); + OperationFormat format(op, hasProperties); OpFormatParser parser(mgr, format, op); FailureOr> elements = parser.parse(); if (failed(elements)) { diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.h b/mlir/tools/mlir-tblgen/OpFormatGen.h index 88dbc99d9f78..5e43f3849866 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.h +++ b/mlir/tools/mlir-tblgen/OpFormatGen.h @@ -20,7 +20,8 @@ class OpClass; class Operator; // Generate the assembly format for the given operator. -void generateOpFormat(const Operator &constOp, OpClass &opClass); +void generateOpFormat(const Operator &constOp, OpClass &opClass, + bool hasProperties); } // namespace tblgen } // namespace mlir -- GitLab From 28a2f57c98431e71f62ce524481a1356a87b5696 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Mon, 21 Oct 2024 14:01:29 -0700 Subject: [PATCH 284/511] [bazel] Pass --build_runfile_links=false (#113221) This improves performance of doing a `bazel test @llvm-project//...` a lot because previously every lit test would have some symlink tree configured for it. 
--- utils/bazel/.bazelrc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/utils/bazel/.bazelrc b/utils/bazel/.bazelrc index d5da8fc6a2fb..2628352f5162 100644 --- a/utils/bazel/.bazelrc +++ b/utils/bazel/.bazelrc @@ -48,6 +48,13 @@ common --incompatible_disallow_empty_glob # TODO: Remove once we move to bazel 7.x build --experimental_cc_shared_library +# Disabling runfiles links drastically increases performance in slow disk IO +# situations Do not build runfile trees by default. If an execution strategy +# relies on runfile symlink tree, the tree is created on-demand. See: +# https://github.com/bazelbuild/bazel/issues/6627 and +# https://github.com/bazelbuild/bazel/commit/03246077f948f2790a83520e7dccc2625650e6df +build --build_runfile_links=false + ############################################################################### # Options to select different strategies for linking potential dependent # libraries. The default leaves it disabled. -- GitLab From 9de0566fcf90dd838558c57df339fbfcc33fe36c Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 21 Oct 2024 14:03:34 -0700 Subject: [PATCH 285/511] [NFC][TableGen] Delete unused class member (#113165) Delete unused class member in `SearchableTableEmitter` class. --- llvm/utils/TableGen/SearchableTableEmitter.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index 4bf4df692acb..91fde0c66305 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -96,7 +96,6 @@ struct GenericTable { class SearchableTableEmitter { const RecordKeeper &Records; std::unique_ptr Target; - std::unique_ptr Intrinsics; std::vector> Enums; DenseMap EnumMap; std::set PreprocessorGuards; -- GitLab From 74e1554d7b4013a975cf5fb8df64a6419bb14a45 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 21 Oct 2024 14:03:49 -0700 Subject: [PATCH 286/511] [lldb] Fix the sorting function for diagnostics (#113220) --- lldb/source/Utility/DiagnosticsRendering.cpp | 4 ++-- lldb/unittests/Utility/DiagnosticsRenderingTest.cpp | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lldb/source/Utility/DiagnosticsRendering.cpp b/lldb/source/Utility/DiagnosticsRendering.cpp index d28a9ab8958b..208733ffc868 100644 --- a/lldb/source/Utility/DiagnosticsRendering.cpp +++ b/lldb/source/Utility/DiagnosticsRendering.cpp @@ -99,10 +99,10 @@ void RenderDiagnosticDetails(Stream &stream, // Sort the diagnostics. 
auto sort = [](auto &ds) { - llvm::sort(ds.begin(), ds.end(), [](auto &d1, auto &d2) { + std::stable_sort(ds.begin(), ds.end(), [](auto &d1, auto &d2) { auto l1 = d1.source_location.value_or(DiagnosticDetail::SourceLocation{}); auto l2 = d2.source_location.value_or(DiagnosticDetail::SourceLocation{}); - return std::pair(l1.line, l2.column) < std::pair(l1.line, l2.column); + return std::tie(l1.line, l1.column) < std::tie(l2.line, l2.column); }); }; sort(remaining_details); diff --git a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp index 39d8b1d55842..ad2ebf7ffe1e 100644 --- a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp +++ b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp @@ -46,16 +46,20 @@ TEST_F(ErrorDisplayTest, RenderStatus) { std::string result = Render({DiagnosticDetail{loc2, eSeverityError, "X", "X"}, DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}}); - ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X")); + // Unintuitively the later diagnostic appears first in the string: + // ^ ^ + // | second + // first + ASSERT_GT(StringRef(result).find("Y"), StringRef(result).find("X")); } { // Test that diagnostics in reverse order are emitted correctly. - SourceLocation loc1 = {FileSpec{"a.c"}, 2, 10, 0, false, true}; + SourceLocation loc1 = {FileSpec{"a.c"}, 1, 10, 0, false, true}; SourceLocation loc2 = {FileSpec{"a.c"}, 1, 20, 0, false, true}; std::string result = Render({DiagnosticDetail{loc2, eSeverityError, "X", "X"}, DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}}); - ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X")); + ASSERT_GT(StringRef(result).find("Y"), StringRef(result).find("X")); } { // Test that range diagnostics are emitted correctly. -- GitLab From 7b703bd3e7c1f8fa2274978679664d41673cdea1 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 21 Oct 2024 22:19:26 +0100 Subject: [PATCH 287/511] [lldb][docs] Fix "Developing LLDB" table of contents (#113166) Currently all the headings marked as `#` show up as a top-level entry in the `Developing LLDB` toctree. This patch marks these as `##` so only `Adding Programming Language Support` is displayed in the table of contents. --- lldb/docs/resources/addinglanguagesupport.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lldb/docs/resources/addinglanguagesupport.md b/lldb/docs/resources/addinglanguagesupport.md index 28789048643d..524d0b5f2940 100644 --- a/lldb/docs/resources/addinglanguagesupport.md +++ b/lldb/docs/resources/addinglanguagesupport.md @@ -49,21 +49,21 @@ clearer that evaluation with the static `Module`-returned `TypeSystem` instances make no sense, and have them error out on those calls. But either approach is fine. -# Creating Types +## Creating Types Your `TypeSystem` will need an approach for creating types based on a set of `Module`s. If your type info is going to come from DWARF info, you will want to subclass [DWARFASTParser](https://github.com/llvm/llvm-project/blob/main/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h). -# Add Expression Evaluation Support +## Add Expression Evaluation Support Expression Evaluation support is enabled by implementing the relevant methods on a `TypeSystem`-derived class. Search for `Expression` in the [TypeSystem header](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Symbol/TypeSystem.h) to find the methods to implement. 
-# Type Completion +## Type Completion There are three levels of type completion, each requiring more type information: 1. Pointer size: When you have a forward decl or a reference, and that's all you @@ -76,7 +76,7 @@ There are three levels of type completion, each requiring more type information: Ensure you never complete more of a type than is needed for a given situation. This will keep your type system from doing more work than necessary. -# Language and LanguageRuntime Plugins +## Language and LanguageRuntime Plugins If you followed the steps outlined above, you already have taught LLDB a great deal about your language. If your language's runtime model and fundamental data @@ -92,4 +92,4 @@ These tasks are covered by two plugins: information (for example dynamic type resolution). * a `Language` plugin, which provides LLDB with a static view of your language; questions that are statically knowable and do not require a process are - answered by this plugin (for example data formatters). \ No newline at end of file + answered by this plugin (for example data formatters). -- GitLab From 9b7be3ebe5c15ff43cfb5232a572289a83f20294 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 21 Oct 2024 14:42:56 -0700 Subject: [PATCH 288/511] [ORC] skip reoptimization tests on s390x. (#112796) The test was failing on s390x with this error: JIT session error: Unsupported target machine architecture in ELF object

-jitted-objectbuffer --- llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp index 4f0850022cf1..db2b767607b9 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp @@ -43,6 +43,10 @@ protected: if (Triple.isOSBinFormatCOFF() && Triple.isAArch64()) GTEST_SKIP(); + // SystemZ is not supported yet. + if (Triple.isSystemZ()) + GTEST_SKIP(); + if (Triple.isPPC()) GTEST_SKIP(); -- GitLab From 6c4267fb1779bc5550bb413f33250f9365acfbc6 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Mon, 21 Oct 2024 15:04:06 -0700 Subject: [PATCH 289/511] [libcxx][libc] Hand in Hand PoC with from_chars (#91651) Implements std::from_chars for float and double. The implementation uses LLVM-libc to do the real parsing. Since this is the first time libc++ uses LLVM-libc there is a bit of additional infrastructure code. The patch is based on the [RFC] Project Hand In Hand (LLVM-libc/libc++ code sharing) https://discourse.llvm.org/t/rfc-project-hand-in-hand-llvm-libc-libc-code-sharing/77701 --- libc/shared/fp_bits.h | 22 + libc/shared/str_to_float.h | 27 + libc/shared/str_to_integer.h | 24 + libc/src/__support/FPUtil/FPBits.h | 12 + libc/src/__support/high_precision_decimal.h | 11 + libc/src/__support/str_to_float.h | 26 + libc/src/__support/str_to_integer.h | 11 + libc/src/__support/str_to_num_result.h | 11 + libcxx/docs/Status/Cxx17Papers.csv | 2 +- libcxx/docs/Status/Cxx2cIssues.csv | 1 + libcxx/include/CMakeLists.txt | 1 + .../__charconv/from_chars_floating_point.h | 73 + libcxx/include/__configuration/availability.h | 13 + libcxx/include/charconv | 7 + libcxx/include/module.modulemap | 1 + libcxx/lib/abi/CHANGELOG.TXT | 7 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...xxabi.v1.stable.noexceptions.nonew.abilist | 4 +- libcxx/src/CMakeLists.txt | 9 +- libcxx/src/charconv.cpp | 12 + .../src/include/from_chars_floating_point.h | 457 +++++ .../floating_point.pass.cpp | 1560 +++++++++++++++++ .../utilities/charconv/charconv.msvc/test.cpp | 34 +- .../charconv/charconv.msvc/test.pass.cpp | 4 + libcxx/test/support/charconv_test_helpers.h | 2 + libcxx/test/support/test_macros.h | 4 + libcxx/utils/libcxx/test/features.py | 8 + .../cmake/Modules/FindLibcCommonUtils.cmake | 14 + .../llvm-project-overlay/libc/BUILD.bazel | 13 + 36 files changed, 2365 insertions(+), 21 deletions(-) create mode 100644 libc/shared/fp_bits.h create mode 100644 libc/shared/str_to_float.h create mode 100644 libc/shared/str_to_integer.h create mode 100644 libcxx/include/__charconv/from_chars_floating_point.h create mode 100644 libcxx/src/include/from_chars_floating_point.h create mode 100644 libcxx/test/std/utilities/charconv/charconv.from.chars/floating_point.pass.cpp create mode 100644 runtimes/cmake/Modules/FindLibcCommonUtils.cmake diff --git a/libc/shared/fp_bits.h b/libc/shared/fp_bits.h new file mode 100644 index 000000000000..2898c508b777 --- /dev/null +++ b/libc/shared/fp_bits.h @@ -0,0 +1,22 @@ +//===-- Floating 
point number utils -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_FP_BITS_H +#define LLVM_LIBC_SHARED_FP_BITS_H + +#include "src/__support/FPUtil/FPBits.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using fputil::FPBits; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_FP_BITS_H diff --git a/libc/shared/str_to_float.h b/libc/shared/str_to_float.h new file mode 100644 index 000000000000..b133a28e26ef --- /dev/null +++ b/libc/shared/str_to_float.h @@ -0,0 +1,27 @@ +//===-- String to float conversion utils ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_STR_TO_FLOAT_H +#define LLVM_LIBC_SHARED_STR_TO_FLOAT_H + +#include "src/__support/str_to_float.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using internal::ExpandedFloat; +using internal::FloatConvertReturn; +using internal::RoundDirection; + +using internal::binary_exp_to_float; +using internal::decimal_exp_to_float; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_STR_TO_FLOAT_H diff --git a/libc/shared/str_to_integer.h b/libc/shared/str_to_integer.h new file mode 100644 index 000000000000..15bee698d5a6 --- /dev/null +++ b/libc/shared/str_to_integer.h @@ -0,0 +1,24 @@ +//===-- String to int conversion utils --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_STR_TO_INTEGER_H +#define LLVM_LIBC_SHARED_STR_TO_INTEGER_H + +#include "src/__support/str_to_integer.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using LIBC_NAMESPACE::StrToNumResult; + +using internal::strtointeger; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_STR_TO_INTEGER_H diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 5d1f633bb56e..6da89091a8ce 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -6,6 +6,12 @@ // //===----------------------------------------------------------------------===// +// ----------------------------------------------------------------------------- +// **** WARNING **** +// This file is shared with libc++. You should also be careful when adding +// dependencies to this file, since it needs to build for all libc++ targets. 
+// -----------------------------------------------------------------------------
+
 #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_FPBITS_H
 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_FPBITS_H
@@ -795,6 +801,12 @@ template <typename T> LIBC_INLINE static constexpr FPType get_fp_type() {
   static_assert(cpp::always_false<T>, "Unsupported type");
 }
 
+// -----------------------------------------------------------------------------
+// **** WARNING ****
+// This interface is shared with libc++, if you change this interface you need
+// to update it in both libc and libc++. You should also be careful when adding
+// dependencies to this file, since it needs to build for all libc++ targets.
+// -----------------------------------------------------------------------------
 // A generic class to manipulate C++ floating point formats.
 // It derives its functionality to FPRepImpl above.
 template <typename T>
diff --git a/libc/src/__support/high_precision_decimal.h b/libc/src/__support/high_precision_decimal.h
index 3e397574d4cb..ac11649d1d16 100644
--- a/libc/src/__support/high_precision_decimal.h
+++ b/libc/src/__support/high_precision_decimal.h
@@ -6,6 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+// -----------------------------------------------------------------------------
+// **** WARNING ****
+// This file is shared with libc++. You should also be careful when adding
+// dependencies to this file, since it needs to build for all libc++ targets.
+// -----------------------------------------------------------------------------
+
 #ifndef LLVM_LIBC_SRC___SUPPORT_HIGH_PRECISION_DECIMAL_H
 #define LLVM_LIBC_SRC___SUPPORT_HIGH_PRECISION_DECIMAL_H
 
@@ -23,6 +29,11 @@ struct LShiftTableEntry {
   char const *power_of_five;
 };
 
+// -----------------------------------------------------------------------------
+// **** WARNING ****
+// This interface is shared with libc++, if you change this interface you need
+// to update it in both libc and libc++.
+// -----------------------------------------------------------------------------
 // This is used in both this file and in the main str_to_float.h.
 // TODO: Figure out where to put this.
 enum class RoundDirection { Up, Down, Nearest };
diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index a452b3a55fde..91569af5cb76 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -6,6 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+// -----------------------------------------------------------------------------
+// **** WARNING ****
+// This file is shared with libc++. You should also be careful when adding
+// dependencies to this file, since it needs to build for all libc++ targets.
+// ----------------------------------------------------------------------------- template struct ExpandedFloat { typename fputil::FPBits::StorageType mantissa; int32_t exponent; }; +// ----------------------------------------------------------------------------- +// **** WARNING **** +// This interface is shared with libc++, if you change this interface you need +// to update it in both libc and libc++. +// ----------------------------------------------------------------------------- template struct FloatConvertReturn { ExpandedFloat num = {0, 0}; int error = 0; @@ -637,6 +653,11 @@ template <> LIBC_INLINE constexpr int32_t get_lower_bound() { return -(309 + 15 + 20); } +// ----------------------------------------------------------------------------- +// **** WARNING **** +// This interface is shared with libc++, if you change this interface you need +// to update it in both libc and libc++. +// ----------------------------------------------------------------------------- // Takes a mantissa and base 10 exponent and converts it into its closest // floating point type T equivalient. First we try the Eisel-Lemire algorithm, // then if that fails then we fall back to a more accurate algorithm for @@ -716,6 +737,11 @@ LIBC_INLINE FloatConvertReturn decimal_exp_to_float( return output; } +// ----------------------------------------------------------------------------- +// **** WARNING **** +// This interface is shared with libc++, if you change this interface you need +// to update it in both libc and libc++. +// ----------------------------------------------------------------------------- // Takes a mantissa and base 2 exponent and converts it into its closest // floating point type T equivalient. Since the exponent is already in the right // form, this is mostly just shifting and rounding. This is used for hexadecimal diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h index c8d02434c89c..86611f9a6902 100644 --- a/libc/src/__support/str_to_integer.h +++ b/libc/src/__support/str_to_integer.h @@ -6,6 +6,12 @@ // //===----------------------------------------------------------------------===// +// ----------------------------------------------------------------------------- +// **** WARNING **** +// This file is shared with libc++. You should also be careful when adding +// dependencies to this file, since it needs to build for all libc++ targets. +// ----------------------------------------------------------------------------- + #ifndef LLVM_LIBC_SRC___SUPPORT_STR_TO_INTEGER_H #define LLVM_LIBC_SRC___SUPPORT_STR_TO_INTEGER_H @@ -73,6 +79,11 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { return 10; } +// ----------------------------------------------------------------------------- +// **** WARNING **** +// This interface is shared with libc++, if you change this interface you need +// to update it in both libc and libc++. +// ----------------------------------------------------------------------------- // Takes a pointer to a string and the base to convert to. This function is used // as the backend for all of the string to int functions. 
template diff --git a/libc/src/__support/str_to_num_result.h b/libc/src/__support/str_to_num_result.h index 6d361357cac2..48c363c88ff4 100644 --- a/libc/src/__support/str_to_num_result.h +++ b/libc/src/__support/str_to_num_result.h @@ -6,6 +6,12 @@ // //===----------------------------------------------------------------------===// +// ----------------------------------------------------------------------------- +// **** WARNING **** +// This file is shared with libc++. You should also be careful when adding +// dependencies to this file, since it needs to build for all libc++ targets. +// ----------------------------------------------------------------------------- + #ifndef LLVM_LIBC_SRC___SUPPORT_STR_TO_NUM_RESULT_H #define LLVM_LIBC_SRC___SUPPORT_STR_TO_NUM_RESULT_H @@ -16,6 +22,11 @@ namespace LIBC_NAMESPACE_DECL { +// ----------------------------------------------------------------------------- +// **** WARNING **** +// This interface is shared with libc++, if you change this interface you need +// to update it in both libc and libc++. +// ----------------------------------------------------------------------------- template struct StrToNumResult { T value; int error; diff --git a/libcxx/docs/Status/Cxx17Papers.csv b/libcxx/docs/Status/Cxx17Papers.csv index 3b56807312d5..7714f41ca19e 100644 --- a/libcxx/docs/Status/Cxx17Papers.csv +++ b/libcxx/docs/Status/Cxx17Papers.csv @@ -71,7 +71,7 @@ "`P0394R4 `__","Hotel Parallelifornia: terminate() for Parallel Algorithms Exception Handling","2016-06 (Oulu)","|Complete|","17.0","" "","","","","","" "`P0003R5 `__","Removing Deprecated Exception Specifications from C++17","2016-11 (Issaquah)","|Complete|","5.0","" -"`P0067R5 `__","Elementary string conversions, revision 5","2016-11 (Issaquah)","|Partial|","","``std::(to|from)_chars`` for integrals has been available since version 7.0. ``std::to_chars`` for ``float`` and ``double`` since version 14.0 ``std::to_chars`` for ``long double`` uses the implementation for ``double``." +"`P0067R5 `__","Elementary string conversions, revision 5","2016-11 (Issaquah)","|Partial|","","``std::(to|from)_chars`` for integrals has been available since version 7.0. ``std::to_chars`` for ``float`` and ``double`` since version 14.0 ``std::to_chars`` for ``long double`` uses the implementation for ``double``. ``std::from_chars`` for ``float`` and ``double`` since version 20.0." 
"`P0403R1 `__","Literal suffixes for ``basic_string_view``\ ","2016-11 (Issaquah)","|Complete|","4.0","" "`P0414R2 `__","Merging shared_ptr changes from Library Fundamentals to C++17","2016-11 (Issaquah)","|Complete|","11.0","" "`P0418R2 `__","Fail or succeed: there is no atomic lattice","2016-11 (Issaquah)","","","" diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index a62c4992020a..19572c655ecd 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -78,4 +78,5 @@ "","","","","","" "`LWG3343 `__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Adopted Yet","|Complete|","16.0","" "`LWG4139 `__","§[time.zone.leap] recursive constraint in <=>","Not Adopted Yet","|Complete|","20.0","" +"`LWG3456 `__","Pattern used by std::from_chars is underspecified (option B)",,"Not Yet Adopted","|Complete|","20.0","" "","","","","","" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index a107314518b1..86d2fc2c2c67 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -235,6 +235,7 @@ set(files __bit/rotate.h __bit_reference __charconv/chars_format.h + __charconv/from_chars_floating_point.h __charconv/from_chars_integral.h __charconv/from_chars_result.h __charconv/tables.h diff --git a/libcxx/include/__charconv/from_chars_floating_point.h b/libcxx/include/__charconv/from_chars_floating_point.h new file mode 100644 index 000000000000..2860b0e8da83 --- /dev/null +++ b/libcxx/include/__charconv/from_chars_floating_point.h @@ -0,0 +1,73 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CHARCONV_FROM_CHARS_FLOATING_POINT_H +#define _LIBCPP___CHARCONV_FROM_CHARS_FLOATING_POINT_H + +#include <__assert> +#include <__charconv/chars_format.h> +#include <__charconv/from_chars_result.h> +#include <__config> +#include <__system_error/errc.h> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 17 + +template +struct __from_chars_result { + _Fp __value; + ptrdiff_t __n; + errc __ec; +}; + +template +_LIBCPP_EXPORTED_FROM_ABI __from_chars_result<_Fp> __from_chars_floating_point( + [[clang::noescape]] const char* __first, [[clang::noescape]] const char* __last, chars_format __fmt); + +extern template __from_chars_result __from_chars_floating_point( + [[clang::noescape]] const char* __first, [[clang::noescape]] const char* __last, chars_format __fmt); + +extern template __from_chars_result __from_chars_floating_point( + [[clang::noescape]] const char* __first, [[clang::noescape]] const char* __last, chars_format __fmt); + +template +_LIBCPP_HIDE_FROM_ABI from_chars_result +__from_chars(const char* __first, const char* __last, _Fp& __value, chars_format __fmt) { + __from_chars_result<_Fp> __r = std::__from_chars_floating_point<_Fp>(__first, __last, __fmt); + if (__r.__ec != errc::invalid_argument) + __value = __r.__value; + return {__first + __r.__n, __r.__ec}; +} + +_LIBCPP_AVAILABILITY_FROM_CHARS_FLOATING_POINT _LIBCPP_HIDE_FROM_ABI inline from_chars_result +from_chars(const char* __first, const char* __last, float& __value, chars_format __fmt = chars_format::general) { + return std::__from_chars(__first, __last, __value, __fmt); +} + +_LIBCPP_AVAILABILITY_FROM_CHARS_FLOATING_POINT _LIBCPP_HIDE_FROM_ABI inline from_chars_result +from_chars(const char* __first, const char* __last, double& __value, chars_format __fmt = chars_format::general) { + return std::__from_chars(__first, __last, __value, __fmt); +} + +#endif // _LIBCPP_STD_VER >= 17 + +_LIBCPP_END_NAMESPACE_STD + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___CHARCONV_FROM_CHARS_FLOATING_POINT_H diff --git a/libcxx/include/__configuration/availability.h b/libcxx/include/__configuration/availability.h index f42ff460db45..173999c46807 100644 --- a/libcxx/include/__configuration/availability.h +++ b/libcxx/include/__configuration/availability.h @@ -87,6 +87,9 @@ // in all versions of the library are available. #if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCPP_INTRODUCED_IN_LLVM_20 1 +# define _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE /* nothing */ + # define _LIBCPP_INTRODUCED_IN_LLVM_19 1 # define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE /* nothing */ @@ -132,6 +135,11 @@ // clang-format off +// LLVM 20 +// TODO: Fill this in +# define _LIBCPP_INTRODUCED_IN_LLVM_20 0 +# define _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE __attribute__((unavailable)) + // LLVM 19 // TODO: Fill this in # define _LIBCPP_INTRODUCED_IN_LLVM_19 0 @@ -409,6 +417,11 @@ #define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19 #define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE +// This controls the availability of floating-point std::from_chars functions. +// These overloads were added later than the integer overloads. 
+#define _LIBCPP_AVAILABILITY_HAS_FROM_CHARS_FLOATING_POINT _LIBCPP_INTRODUCED_IN_LLVM_20 +#define _LIBCPP_AVAILABILITY_FROM_CHARS_FLOATING_POINT _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE + // Define availability attributes that depend on _LIBCPP_HAS_EXCEPTIONS. // Those are defined in terms of the availability attributes above, and // should not be vendor-specific. diff --git a/libcxx/include/charconv b/libcxx/include/charconv index a2e270e9316d..29c6875008ab 100644 --- a/libcxx/include/charconv +++ b/libcxx/include/charconv @@ -65,6 +65,12 @@ namespace std { constexpr from_chars_result from_chars(const char* first, const char* last, see below& value, int base = 10); // constexpr since C++23 + from_chars_result from_chars(const char* first, const char* last, + float& value, chars_format fmt); + + from_chars_result from_chars(const char* first, const char* last, + double& value, chars_format fmt); + } // namespace std */ @@ -73,6 +79,7 @@ namespace std { #if _LIBCPP_STD_VER >= 17 # include <__charconv/chars_format.h> +# include <__charconv/from_chars_floating_point.h> # include <__charconv/from_chars_integral.h> # include <__charconv/from_chars_result.h> # include <__charconv/tables.h> diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 06e93d245290..d775da489e35 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -898,6 +898,7 @@ module std [system] { module charconv { module chars_format { header "__charconv/chars_format.h" } + module from_chars_floating_point { header "__charconv/from_chars_floating_point.h" } module from_chars_integral { header "__charconv/from_chars_integral.h" } module from_chars_result { header "__charconv/from_chars_result.h" } module tables { header "__charconv/tables.h" } diff --git a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT index 6911694b75d8..e27eb5fa046f 100644 --- a/libcxx/lib/abi/CHANGELOG.TXT +++ b/libcxx/lib/abi/CHANGELOG.TXT @@ -16,6 +16,13 @@ New entries should be added directly below the "Version" header. 
Version 20.0 ------------ +* [libcxx][libc] Implements from_chars floating-point + + All platforms + ------------- + Symbol added: _ZNSt3__127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE + Symbol added: _ZNSt3__127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE + * [libc++] Stop trying to avoid exporting some typeinfo names This patch removes the explicit list of symbols to avoid exporting diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index db77e1d0ac30..79f999b3e02b 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1584,6 +1584,8 @@ {'is_defined': True, 'name': '__ZNSt3__123__libcpp_atomic_monitorEPVKNS_17__cxx_atomic_implIxNS_22__cxx_atomic_base_implIxEEEE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__123__libcpp_atomic_monitorEPVKv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__125notify_all_at_thread_exitERNS_18condition_variableENS_11unique_lockINS_5mutexEEE', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__131__arrive_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseEh', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__132__destroy_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__134__construct_barrier_algorithm_baseERl', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index 8af5db472f7c..9efdf11940a7 100644 --- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1220,6 +1220,8 @@ {'is_defined': True, 'name': '_ZNSt6__ndk123__libcpp_atomic_monitorEPVKNS_17__cxx_atomic_implIiNS_22__cxx_atomic_base_implIiEEEE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk123__libcpp_atomic_monitorEPVKv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk125notify_all_at_thread_exitERNS_18condition_variableENS_11unique_lockINS_5mutexEEE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk131__arrive_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseEh', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk132__destroy_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk134__construct_barrier_algorithm_baseERi', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 033d9f9987fa..7fde4b905fc5 100644 --- 
a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1715,6 +1715,8 @@ {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEaSEOS5_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEaSEOS5_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__122__libcpp_verbose_abortEPKcz', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__14__fs10filesystem16_FilesystemClock9is_steadyE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__14__fs10filesystem4path19preferred_separatorE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__16__sortIRNS_6__lessIaaEEPaEEvT0_S5_T_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 332d8abeb03e..da30346257f9 100644 --- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1715,6 +1715,8 @@ {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEaSEOS5_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEaSEOS5_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__122__libcpp_verbose_abortEPKcz', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__14__fs10filesystem16_FilesystemClock9is_steadyE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__14__fs10filesystem4path19preferred_separatorE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'wEXP', 'is_defined': True, 'name': '_ZNSt3__16__sortIRNS_6__lessIaaEEPaEEvT0_S5_T_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index 62716f5c415f..e1dc6e778b57 
100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1584,6 +1584,8 @@ {'is_defined': True, 'name': '__ZNSt3__123__libcpp_atomic_monitorEPVKNS_17__cxx_atomic_implIxNS_22__cxx_atomic_base_implIxEEEE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__123__libcpp_atomic_monitorEPVKv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__125notify_all_at_thread_exitERNS_18condition_variableENS_11unique_lockINS_5mutexEEE', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__127__from_chars_floating_pointEPKcS1_RdNS_12chars_formatE', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNSt3__127__from_chars_floating_pointEPKcS1_RfNS_12chars_formatE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__131__arrive_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseEh', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__132__destroy_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseE', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt3__134__construct_barrier_algorithm_baseERl', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index 6b77cda1e286..ceeeffe4d979 100644 --- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1220,6 +1220,8 @@ {'is_defined': True, 'name': '_ZNSt6__ndk123__libcpp_atomic_monitorEPVKNS_17__cxx_atomic_implIiNS_22__cxx_atomic_base_implIiEEEE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk123__libcpp_atomic_monitorEPVKv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk125notify_all_at_thread_exitERNS_18condition_variableENS_11unique_lockINS_5mutexEEE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt6__ndk127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk131__arrive_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseEh', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk132__destroy_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt6__ndk134__construct_barrier_algorithm_baseERl', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist index 3458b333dd6a..d3670d237b23 100644 --- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1235,6 +1235,8 @@ {'is_defined': True, 'name': '_ZNSt3__123__libcpp_atomic_monitorEPVKNS_17__cxx_atomic_implIlNS_22__cxx_atomic_base_implIlEEEE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__123__libcpp_atomic_monitorEPVKv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__125notify_all_at_thread_exitERNS_18condition_variableENS_11unique_lockINS_5mutexEEE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} 
+{'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__131__arrive_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseEh', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__132__destroy_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__134__construct_barrier_algorithm_baseERl', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist index bdf90ba25c7f..2c21a03d41a0 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -1233,6 +1233,8 @@ {'is_defined': True, 'name': '_ZNSt3__123__libcpp_atomic_monitorEPVKNS_17__cxx_atomic_implIiNS_22__cxx_atomic_base_implIiEEEE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__123__libcpp_atomic_monitorEPVKv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__125notify_all_at_thread_exitERNS_18condition_variableENS_11unique_lockINS_5mutexEEE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__131__arrive_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseEh', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__132__destroy_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__134__construct_barrier_algorithm_baseERl', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist index ac3cc129c04b..0d4c50950908 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist @@ -1204,6 +1204,8 @@ {'is_defined': True, 'name': '_ZNSt3__123__libcpp_atomic_monitorEPVKNS_17__cxx_atomic_implIiNS_22__cxx_atomic_base_implIiEEEE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__123__libcpp_atomic_monitorEPVKv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__125notify_all_at_thread_exitERNS_18condition_variableENS_11unique_lockINS_5mutexEEE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIdEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt3__127__from_chars_floating_pointIfEENS_19__from_chars_resultIT_EEPKcS5_NS_12chars_formatE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__131__arrive_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseEh', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__132__destroy_barrier_algorithm_baseEPNS_24__barrier_algorithm_baseE', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt3__134__construct_barrier_algorithm_baseERl', 'type': 'FUNC'} @@ -2006,4 +2008,4 @@ {'is_defined': True, 'name': '_ZTv0_n24_NSt3__114basic_iostreamIcNS_11char_traitsIcEEED0Ev', 'type': 'FUNC'} {'is_defined': True, 'name': 
'_ZTv0_n24_NSt3__114basic_iostreamIcNS_11char_traitsIcEEED1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZTv0_n24_NSt3__19strstreamD0Ev', 'type': 'FUNC'} -{'is_defined': True, 'name': '_ZTv0_n24_NSt3__19strstreamD1Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZTv0_n24_NSt3__19strstreamD1Ev', 'type': 'FUNC'} \ No newline at end of file diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 4af04f202db1..cce8b8976f73 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -31,6 +31,7 @@ set(LIBCXX_SOURCES include/ryu/f2s.h include/ryu/ryu.h include/to_chars_floating_point.h + include/from_chars_floating_point.h legacy_pointer_safety.cpp memory.cpp memory_resource.cpp @@ -172,11 +173,14 @@ endif() split_list(LIBCXX_COMPILE_FLAGS) split_list(LIBCXX_LINK_FLAGS) +include(FindLibcCommonUtils) + # Build the shared library. add_library(cxx_shared SHARED ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) target_include_directories(cxx_shared PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(cxx_shared PUBLIC cxx-headers libcxx-libc-shared - PRIVATE ${LIBCXX_LIBRARIES}) + PRIVATE ${LIBCXX_LIBRARIES} + PRIVATE llvm-libc-common-utilities) set_target_properties(cxx_shared PROPERTIES EXCLUDE_FROM_ALL "$,FALSE,TRUE>" @@ -267,7 +271,8 @@ add_library(cxx_static STATIC ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) target_include_directories(cxx_static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(cxx_static PUBLIC cxx-headers libcxx-libc-static PRIVATE ${LIBCXX_LIBRARIES} - PRIVATE libcxx-abi-static) + PRIVATE libcxx-abi-static + PRIVATE llvm-libc-common-utilities) set_target_properties(cxx_static PROPERTIES EXCLUDE_FROM_ALL "$,FALSE,TRUE>" diff --git a/libcxx/src/charconv.cpp b/libcxx/src/charconv.cpp index 4fd7a2c2c0f0..3fe0afec0e28 100644 --- a/libcxx/src/charconv.cpp +++ b/libcxx/src/charconv.cpp @@ -9,6 +9,7 @@ #include #include +#include "include/from_chars_floating_point.h" #include "include/to_chars_floating_point.h" _LIBCPP_BEGIN_NAMESPACE_STD @@ -74,4 +75,15 @@ to_chars_result to_chars(char* __first, char* __last, long double __value, chars __first, __last, static_cast(__value), __fmt, __precision); } +template +__from_chars_result<_Fp> __from_chars_floating_point( + [[clang::noescape]] const char* __first, [[clang::noescape]] const char* __last, chars_format __fmt) { + return std::__from_chars_floating_point_impl<_Fp>(__first, __last, __fmt); +} + +template __from_chars_result __from_chars_floating_point( + [[clang::noescape]] const char* __first, [[clang::noescape]] const char* __last, chars_format __fmt); + +template __from_chars_result __from_chars_floating_point( + [[clang::noescape]] const char* __first, [[clang::noescape]] const char* __last, chars_format __fmt); _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/include/from_chars_floating_point.h b/libcxx/src/include/from_chars_floating_point.h new file mode 100644 index 000000000000..19eeeb28fb08 --- /dev/null +++ b/libcxx/src/include/from_chars_floating_point.h @@ -0,0 +1,457 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_SRC_INCLUDE_FROM_CHARS_FLOATING_POINT_H
+#define _LIBCPP_SRC_INCLUDE_FROM_CHARS_FLOATING_POINT_H
+
+// These headers are in the shared LLVM-libc header library.
+#include "shared/fp_bits.h"
+#include "shared/str_to_float.h"
+#include "shared/str_to_integer.h"
+
+#include <__assert>
+#include <__config>
+#include <cctype>
+#include <charconv>
+#include <cstdint>
+#include <limits>
+
+// Included for the _Floating_type_traits class
+#include "to_chars_floating_point.h"
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// Parses an infinity string.
+// Valid strings are case insensitive and contain INF or INFINITY.
+//
+// - __first is the first argument to std::from_chars. When the string is
+//   invalid this value is returned as ptr in the result.
+// - __last is the last argument of std::from_chars.
+// - __ptr is the current position in the input string. It points beyond the
+//   initial I character.
+// - __negative indicates whether a valid string represents -inf or +inf.
+template <class _Fp>
+__from_chars_result<_Fp>
+__from_chars_floating_point_inf(const char* const __first, const char* __last, const char* __ptr, bool __negative) {
+  if (__last - __ptr < 2) [[unlikely]]
+    return {_Fp{0}, 0, errc::invalid_argument};
+
+  if (std::tolower(__ptr[0]) != 'n' || std::tolower(__ptr[1]) != 'f') [[unlikely]]
+    return {_Fp{0}, 0, errc::invalid_argument};
+
+  __ptr += 2;
+
+  // At this point the result is valid and contains INF.
+  // When the remaining part contains INITY this will be consumed. Otherwise
+  // only INF is consumed. For example INFINITZ will consume INF and ignore
+  // INITZ.
+
+  if (__last - __ptr >= 5              //
+      && std::tolower(__ptr[0]) == 'i' //
+      && std::tolower(__ptr[1]) == 'n' //
+      && std::tolower(__ptr[2]) == 'i' //
+      && std::tolower(__ptr[3]) == 't' //
+      && std::tolower(__ptr[4]) == 'y')
+    __ptr += 5;
+
+  if constexpr (numeric_limits<_Fp>::has_infinity) {
+    if (__negative)
+      return {-std::numeric_limits<_Fp>::infinity(), __ptr - __first, std::errc{}};
+
+    return {std::numeric_limits<_Fp>::infinity(), __ptr - __first, std::errc{}};
+  } else {
+    return {_Fp{0}, __ptr - __first, errc::result_out_of_range};
+  }
+}
+
+// Parses a nan string.
+// Valid strings are case insensitive and contain NAN, optionally followed by
+// ( n-char-sequence_opt ).
+//
+// - __first is the first argument to std::from_chars. When the string is
+//   invalid this value is returned as ptr in the result.
+// - __last is the last argument of std::from_chars.
+// - __ptr is the current position in the input string. It points beyond the
+//   initial N character.
+// - __negative indicates whether a valid string represents -nan or +nan.
+template <class _Fp>
+__from_chars_result<_Fp>
+__from_chars_floating_point_nan(const char* const __first, const char* __last, const char* __ptr, bool __negative) {
+  if (__last - __ptr < 2) [[unlikely]]
+    return {_Fp{0}, 0, errc::invalid_argument};
+
+  if (std::tolower(__ptr[0]) != 'a' || std::tolower(__ptr[1]) != 'n') [[unlikely]]
+    return {_Fp{0}, 0, errc::invalid_argument};
+
+  __ptr += 2;
+
+  // At this point the result is valid and contains NAN. When the remaining
+  // part contains ( n-char-sequence_opt ) this will be consumed. Otherwise
+  // only NAN is consumed. For example NAN(abcd will consume NAN and ignore
+  // (abcd.
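To make the consumption rule above concrete, here is a minimal sketch of the observable behavior through the public std::from_chars interface (the payload strings are illustrative only):

#include <cassert>
#include <charconv>
#include <cmath>
#include <cstring>

int main() {
  double d = 0.25;

  // A terminated n-char-sequence: the closing parenthesis is consumed too.
  const char* ok = "nan(1_a)";
  std::from_chars_result r1 = std::from_chars(ok, ok + std::strlen(ok), d);
  assert(r1.ptr == ok + 8 && std::isnan(d));

  // An unterminated payload: only "nan" itself is consumed.
  const char* bad = "nan(abcd";
  std::from_chars_result r2 = std::from_chars(bad, bad + std::strlen(bad), d);
  assert(r2.ptr == bad + 3 && std::isnan(d));
}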
+  if (__last - __ptr >= 2 && __ptr[0] == '(') {
+    size_t __offset = 1;
+    do {
+      if (__ptr[__offset] == ')') {
+        __ptr += __offset + 1;
+        break;
+      }
+      if (__ptr[__offset] != '_' && !std::isalnum(__ptr[__offset]))
+        break;
+      ++__offset;
+    } while (__ptr + __offset != __last);
+  }
+
+  if (__negative)
+    return {-std::numeric_limits<_Fp>::quiet_NaN(), __ptr - __first, std::errc{}};
+
+  return {std::numeric_limits<_Fp>::quiet_NaN(), __ptr - __first, std::errc{}};
+}
+
+template <class _Tp>
+struct __fractional_constant_result {
+  size_t __offset{size_t(-1)};
+  _Tp __mantissa{0};
+  int __exponent{0};
+  bool __truncated{false};
+  bool __is_valid{false};
+};
+
+// Parses the hex constant part of the hexadecimal floating-point value.
+// - __input is the start of the buffer given to from_chars.
+// - __n is the number of elements in the buffer.
+// - __offset is where to start parsing. The input can have an optional sign;
+//   the offset starts after this sign.
+template <class _Tp>
+__fractional_constant_result<_Tp> __parse_fractional_hex_constant(const char* __input, size_t __n, size_t __offset) {
+  __fractional_constant_result<_Tp> __result;
+
+  const _Tp __mantissa_truncate_threshold = numeric_limits<_Tp>::max() / 16;
+  bool __fraction = false;
+  for (; __offset < __n; ++__offset) {
+    if (std::isxdigit(__input[__offset])) {
+      __result.__is_valid = true;
+
+      uint32_t __digit = __input[__offset] - '0';
+      switch (std::tolower(__input[__offset])) {
+      case 'a':
+        __digit = 10;
+        break;
+      case 'b':
+        __digit = 11;
+        break;
+      case 'c':
+        __digit = 12;
+        break;
+      case 'd':
+        __digit = 13;
+        break;
+      case 'e':
+        __digit = 14;
+        break;
+      case 'f':
+        __digit = 15;
+        break;
+      }
+
+      if (__result.__mantissa < __mantissa_truncate_threshold) {
+        __result.__mantissa = (__result.__mantissa * 16) + __digit;
+        if (__fraction)
+          __result.__exponent -= 4;
+      } else {
+        if (__digit > 0)
+          __result.__truncated = true;
+        if (!__fraction)
+          __result.__exponent += 4;
+      }
+    } else if (__input[__offset] == '.') {
+      if (__fraction)
+        break; // this means that __input[__offset] points to a second decimal point, ending the number.
+
+      __fraction = true;
+    } else
+      break;
+  }
+
+  __result.__offset = __offset;
+  return __result;
+}
+
+struct __exponent_result {
+  size_t __offset{size_t(-1)};
+  int __value{0};
+  bool __present{false};
+};
+
+// When the exponent is not present the result of the struct contains
+// __offset, 0, false. This allows using the results unconditionally; the
+// __present field matters for the scientific notation, where the exponent is
+// mandatory.
+__exponent_result __parse_exponent(const char* __input, size_t __n, size_t __offset, char __marker) {
+  if (__offset + 1 < __n &&                          // an exponent always needs at least one digit.
+      std::tolower(__input[__offset]) == __marker && //
+      !std::isspace(__input[__offset + 1])           // leading whitespace is not allowed.
+  ) {
+    ++__offset;
+    LIBC_NAMESPACE::shared::StrToNumResult<int> __e =
+        LIBC_NAMESPACE::shared::strtointeger<int>(__input + __offset, 10, __n - __offset);
+    // __e.error contains the errno value, 0 or ERANGE; these are not
+    // interesting. If the number of characters parsed is 0 it means there was
+    // no number.
+    if (__e.parsed_len != 0)
+      return {__offset + __e.parsed_len, __e.value, true};
+    else
+      --__offset; // the assumption of a valid exponent was not true, undo eating the exponent character.
+  }
+
+  return {__offset, 0, false};
+}
+
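The exponent returned by __parse_exponent is an int, and the fractional-constant exponent can grow with the number of digits parsed, so the merge below is performed in 64-bit arithmetic and then clamped. A sketch of the observable effect, mirroring the out-of-range tests added later in this patch:

#include <cassert>
#include <charconv>
#include <cstring>
#include <limits>

int main() {
  // An exponent far beyond any 32-bit computation saturates cleanly: the
  // whole number is consumed and the result reports out-of-range.
  const char* s = "1e9999999999999999999999999999999999999999";
  double d = 0.25;
  std::from_chars_result r = std::from_chars(s, s + std::strlen(s), d);
  assert(r.ec == std::errc::result_out_of_range);
  assert(r.ptr == s + std::strlen(s));
  assert(d == std::numeric_limits<double>::infinity());
}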
+// Here we do this operation as int64 to avoid overflow.
+int32_t __merge_exponents(int64_t __fractional, int64_t __exponent, int __max_biased_exponent) {
+  int64_t __sum = __fractional + __exponent;
+
+  if (__sum > __max_biased_exponent)
+    return __max_biased_exponent;
+
+  if (__sum < -__max_biased_exponent)
+    return -__max_biased_exponent;
+
+  return __sum;
+}
+
+template <class _Fp, class _Tp>
+__from_chars_result<_Fp>
+__calculate_result(_Tp __mantissa, int __exponent, bool __negative, __from_chars_result<_Fp> __result) {
+  auto __r = LIBC_NAMESPACE::shared::FPBits<_Fp>();
+  __r.set_mantissa(__mantissa);
+  __r.set_biased_exponent(__exponent);
+
+  // C17 7.12.1/6
+  // The result underflows if the magnitude of the mathematical result is so
+  // small that the mathematical result cannot be represented, without
+  // extraordinary roundoff error, in an object of the specified type.237) If
+  // the result underflows, the function returns an implementation-defined
+  // value whose magnitude is no greater than the smallest normalized positive
+  // number in the specified type; if the integer expression math_errhandling
+  // & MATH_ERRNO is nonzero, whether errno acquires the value ERANGE is
+  // implementation-defined; if the integer expression math_errhandling &
+  // MATH_ERREXCEPT is nonzero, whether the "underflow" floating-point
+  // exception is raised is implementation-defined.
+  //
+  // LLVM-LIBC sets ERANGE for subnormal values.
+  //
+  // [charconv.from.chars]/1
+  // ... If the parsed value is not in the range representable by the type of
+  // value, value is unmodified and the member ec of the return value is
+  // equal to errc::result_out_of_range. ...
+  //
+  // Undo the ERANGE for subnormal values.
+  if (__result.__ec == errc::result_out_of_range && __r.is_subnormal() && !__r.is_zero())
+    __result.__ec = errc{};
+
+  if (__negative)
+    __result.__value = -__r.get_val();
+  else
+    __result.__value = __r.get_val();
+
+  return __result;
+}
+
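As a usage reminder before the hex parser, chars_format::hex takes the hexadecimal digits without a 0x prefix and with an optional p exponent; a small sketch of the expected behavior:

#include <cassert>
#include <charconv>
#include <cstring>

int main() {
  // "1.8" in hex is 1 + 8/16 = 1.5, and "p1" scales by 2^1, giving 3.0.
  const char* s = "1.8p1";
  double d = 0.0;
  std::from_chars_result r = std::from_chars(s, s + std::strlen(s), d, std::chars_format::hex);
  assert(r.ec == std::errc{});
  assert(r.ptr == s + std::strlen(s));
  assert(d == 3.0);
}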
+// Implements from_chars for hexadecimal floating-point values.
+// __first    forwarded from from_chars
+// __last     forwarded from from_chars
+// __ptr      the start of the buffer to parse. This is after the optional sign character.
+// __negative should the parsed value be negated?
+//
+// This function and __from_chars_floating_point_decimal are similar. However
+// the similar parts are all in helper functions, so the amount of code
+// duplication is minimal.
+template <class _Fp>
+__from_chars_result<_Fp>
+__from_chars_floating_point_hex(const char* const __first, const char* __last, const char* __ptr, bool __negative) {
+  size_t __n = __last - __first;
+  ptrdiff_t __offset = __ptr - __first;
+
+  auto __fractional =
+      std::__parse_fractional_hex_constant<typename _Floating_type_traits<_Fp>::_Uint_type>(__first, __n, __offset);
+  if (!__fractional.__is_valid)
+    return {_Fp{0}, 0, errc::invalid_argument};
+
+  auto __parsed_exponent = std::__parse_exponent(__first, __n, __fractional.__offset, 'p');
+  __offset = __parsed_exponent.__offset;
+  int __exponent = std::__merge_exponents(
+      __fractional.__exponent, __parsed_exponent.__value, LIBC_NAMESPACE::shared::FPBits<_Fp>::MAX_BIASED_EXPONENT);
+
+  __from_chars_result<_Fp> __result{_Fp{0}, __offset, {}};
+  LIBC_NAMESPACE::shared::ExpandedFloat<_Fp> __expanded_float = {0, 0};
+  if (__fractional.__mantissa != 0) {
+    auto __temp = LIBC_NAMESPACE::shared::binary_exp_to_float<_Fp>(
+        {__fractional.__mantissa, __exponent},
+        __fractional.__truncated,
+        LIBC_NAMESPACE::shared::RoundDirection::Nearest);
+    __expanded_float = __temp.num;
+    if (__temp.error == ERANGE) {
+      __result.__ec = errc::result_out_of_range;
+    }
+  }
+
+  return std::__calculate_result<_Fp>(__expanded_float.mantissa, __expanded_float.exponent, __negative, __result);
+}
+
+// Parses the decimal constant part of the decimal floating-point value.
+// - __input is the start of the buffer given to from_chars.
+// - __n is the number of elements in the buffer.
+// - __offset is where to start parsing. The input can have an optional sign;
+//   the offset starts after this sign.
+template <class _Tp>
+__fractional_constant_result<_Tp>
+__parse_fractional_decimal_constant(const char* __input, ptrdiff_t __n, ptrdiff_t __offset) {
+  __fractional_constant_result<_Tp> __result;
+
+  const _Tp __mantissa_truncate_threshold = numeric_limits<_Tp>::max() / 10;
+  bool __fraction = false;
+  for (; __offset < __n; ++__offset) {
+    if (std::isdigit(__input[__offset])) {
+      __result.__is_valid = true;
+
+      uint32_t __digit = __input[__offset] - '0';
+      if (__result.__mantissa < __mantissa_truncate_threshold) {
+        __result.__mantissa = (__result.__mantissa * 10) + __digit;
+        if (__fraction)
+          --__result.__exponent;
+      } else {
+        if (__digit > 0)
+          __result.__truncated = true;
+        if (!__fraction)
+          ++__result.__exponent;
+      }
+    } else if (__input[__offset] == '.') {
+      if (__fraction)
+        break; // this means that __input[__offset] points to a second decimal point, ending the number.
+
+      __fraction = true;
+    } else
+      break;
+  }
+
+  __result.__offset = __offset;
+  return __result;
+}
+
+// Implements from_chars for decimal floating-point values.
+// __first    forwarded from from_chars
+// __last     forwarded from from_chars
+// __fmt      forwarded from from_chars
+// __ptr      the start of the buffer to parse. This is after the optional sign character.
+// __negative should the parsed value be negated?
+template <class _Fp>
+__from_chars_result<_Fp> __from_chars_floating_point_decimal(
+    const char* const __first, const char* __last, chars_format __fmt, const char* __ptr, bool __negative) {
+  ptrdiff_t __n = __last - __first;
+  ptrdiff_t __offset = __ptr - __first;
+
+  auto __fractional =
+      std::__parse_fractional_decimal_constant<typename _Floating_type_traits<_Fp>::_Uint_type>(__first, __n, __offset);
+  if (!__fractional.__is_valid)
+    return {_Fp{0}, 0, errc::invalid_argument};
+
+  __offset = __fractional.__offset;
+
+  // LWG3456 Pattern used by std::from_chars is underspecified
+  // This changes fixed to ignore a possible exponent instead of making its
+  // existence an error.
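In observable terms, under this resolution a trailing exponent simply ends the fixed-format match rather than invalidating it, as the fixed-format tests below also verify:

#include <cassert>
#include <charconv>
#include <cstring>

int main() {
  // With chars_format::fixed, matching stops in front of "e10"; the digits
  // already matched still produce a value.
  const char* s = "1.5e10";
  double d = 0.0;
  std::from_chars_result r = std::from_chars(s, s + std::strlen(s), d, std::chars_format::fixed);
  assert(r.ec == std::errc{});
  assert(r.ptr == s + 3);
  assert(d == 1.5);
}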
+  int __exponent;
+  if (__fmt == chars_format::fixed) {
+    __exponent =
+        std::__merge_exponents(__fractional.__exponent, 0, LIBC_NAMESPACE::shared::FPBits<_Fp>::MAX_BIASED_EXPONENT);
+  } else {
+    auto __parsed_exponent = std::__parse_exponent(__first, __n, __offset, 'e');
+    if (__fmt == chars_format::scientific && !__parsed_exponent.__present) {
+      // [charconv.from.chars]/6.2 if fmt has chars_format::scientific set but
+      // not chars_format::fixed, the otherwise optional exponent part shall
+      // appear;
+      return {_Fp{0}, 0, errc::invalid_argument};
+    }
+
+    __offset = __parsed_exponent.__offset;
+    __exponent = std::__merge_exponents(
+        __fractional.__exponent, __parsed_exponent.__value, LIBC_NAMESPACE::shared::FPBits<_Fp>::MAX_BIASED_EXPONENT);
+  }
+
+  __from_chars_result<_Fp> __result{_Fp{0}, __offset, {}};
+  LIBC_NAMESPACE::shared::ExpandedFloat<_Fp> __expanded_float = {0, 0};
+  if (__fractional.__mantissa != 0) {
+    // This function expects to parse a positive value. This means it does not
+    // take __first, __n as arguments, since __first points to '-' for
+    // negative values.
+    auto __temp = LIBC_NAMESPACE::shared::decimal_exp_to_float<_Fp>(
+        {__fractional.__mantissa, __exponent},
+        __fractional.__truncated,
+        LIBC_NAMESPACE::shared::RoundDirection::Nearest,
+        __ptr,
+        __last - __ptr);
+    __expanded_float = __temp.num;
+    if (__temp.error == ERANGE) {
+      __result.__ec = errc::result_out_of_range;
+    }
+  }
+
+  return std::__calculate_result(__expanded_float.mantissa, __expanded_float.exponent, __negative, __result);
+}
+
+template <class _Fp>
+__from_chars_result<_Fp>
+__from_chars_floating_point_impl(const char* const __first, const char* __last, chars_format __fmt) {
+  if (__first == __last) [[unlikely]]
+    return {_Fp{0}, 0, errc::invalid_argument};
+
+  const char* __ptr = __first;
+  bool __negative = *__ptr == '-';
+  if (__negative) {
+    ++__ptr;
+    if (__ptr == __last) [[unlikely]]
+      return {_Fp{0}, 0, errc::invalid_argument};
+  }
+
+  // [charconv.from.chars]
+  // [Note 1: If the pattern allows for an optional sign, but the string has
+  // no digit characters following the sign, no characters match the pattern.
+  // -- end note]
+  // This is true for integrals; floating-point parsing still allows -.0.
+
+  // [charconv.from.chars]/6.2
+  // if fmt has chars_format::scientific set but not chars_format::fixed, the
+  // otherwise optional exponent part shall appear;
+  // Since INF/NAN do not have an exponent this value is not valid.
+  //
+  // LWG3456 Pattern used by std::from_chars is underspecified
+  // does not address this point, but its proposed option B does solve the
+  // issue. Both the MSVC STL and libstdc++ implement this behaviour.
+  switch (std::tolower(*__ptr)) {
+  case 'i':
+    return std::__from_chars_floating_point_inf<_Fp>(__first, __last, __ptr + 1, __negative);
+  case 'n':
+    if constexpr (numeric_limits<_Fp>::has_quiet_NaN)
+      // NOTE: The string passed here will be parsed in the default C locale.
+      // This is standard behavior (see https://eel.is/c++draft/charconv.from.chars), but may be unexpected.
+ return std::__from_chars_floating_point_nan<_Fp>(__first, __last, __ptr + 1, __negative); + return {_Fp{0}, 0, errc::invalid_argument}; + } + + if (__fmt == chars_format::hex) + return std::__from_chars_floating_point_hex<_Fp>(__first, __last, __ptr, __negative); + + return std::__from_chars_floating_point_decimal<_Fp>(__first, __last, __fmt, __ptr, __negative); +} + +_LIBCPP_END_NAMESPACE_STD + +#endif //_LIBCPP_SRC_INCLUDE_FROM_CHARS_FLOATING_POINT_H diff --git a/libcxx/test/std/utilities/charconv/charconv.from.chars/floating_point.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.from.chars/floating_point.pass.cpp new file mode 100644 index 000000000000..6faf0499c4c9 --- /dev/null +++ b/libcxx/test/std/utilities/charconv/charconv.from.chars/floating_point.pass.cpp @@ -0,0 +1,1560 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// XFAIL: availability-fp_from_chars-missing + +// from_chars_result from_chars(const char* first, const char* last, +// float& value, chars_format fmt = chars_format::general) +// +// from_chars_result from_chars(const char* first, const char* last, +// double& value, chars_format fmt = chars_format::general) + +#include +#include +#include +#include +#include +#include +#include + +#include "charconv_test_helpers.h" +#include "test_macros.h" + +template +void test_infinity(std::chars_format fmt) { + const char* s = "-InFiNiTyXXX"; + { // I + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 2, value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == s + 1); + assert(value == F(0.25)); + } + { // In + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 3, value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == s + 1); + assert(value == F(0.25)); + } + { // InF + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 4, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(value == std::numeric_limits::infinity()); + } + { // -InF + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 4, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(value == -std::numeric_limits::infinity()); + } + { // InFi + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 5, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(value == std::numeric_limits::infinity()); + } + { // -InFiN + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 6, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(value == -std::numeric_limits::infinity()); + } + { // InFiNi + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 7, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(value == std::numeric_limits::infinity()); + } + { // -InFiNiT + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 8, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(value == 
-std::numeric_limits::infinity()); + } + { // InFiNiTy + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 9, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 9); + assert(value == std::numeric_limits::infinity()); + } + { // -InFiNiTy + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 9, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 9); + assert(value == -std::numeric_limits::infinity()); + } + { // InFiNiTyXXX + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 12, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 9); + assert(value == std::numeric_limits::infinity()); + } + { // -InFiNiTyXXX + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 12, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 9); + assert(value == -std::numeric_limits::infinity()); + } +} + +template +void test_nan(std::chars_format fmt) { + { + const char* s = "-NaN(1_A)XXX"; + { // N + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 2, value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == s + 1); + assert(value == F(0.25)); + } + { // Na + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 3, value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == s + 1); + assert(value == F(0.25)); + } + { // NaN + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 4, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(std::isnan(value)); + assert(!std::signbit(value)); + } + { // -NaN + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 0, s + 4, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(std::isnan(value)); + assert(std::signbit(value)); + } + { // NaN( + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 5, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(std::isnan(value)); + assert(!std::signbit(value)); + } + { // -NaN(1 + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 6, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(std::isnan(value)); + assert(std::signbit(value)); + } + { // NaN(1_ + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 7, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(std::isnan(value)); + assert(!std::signbit(value)); + } + { // -NaN(1_A + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 8, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 4); + assert(std::isnan(value)); + assert(std::signbit(value)); + } + { // NaN(1_A) + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 9, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 9); + assert(std::isnan(value)); + assert(!std::signbit(value)); + } + { // -NaN(1_A) + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 9, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 9); + assert(std::isnan(value)); + assert(std::signbit(value)); + } + { // NaN(1_A)XXX + F value = 0.25; + std::from_chars_result result = std::from_chars(s + 1, s + 12, 
value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 9); + assert(std::isnan(value)); + assert(!std::signbit(value)); + } + { // -NaN(1_A)XXX + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + 12, value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 9); + assert(std::isnan(value)); + assert(std::signbit(value)); + } + } + { + const char* s = "NaN()"; + F value = 0.25; + std::from_chars_result result = std::from_chars(s, s + std::strlen(s), value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s + 5); + assert(std::isnan(value)); + assert(!std::signbit(value)); + } + { // validates a n-char-sequences with an invalid value + std::array s = {'N', 'a', 'N', '(', ' ', ')'}; + s[4] = 'a'; + { + F value = 0.25; + std::from_chars_result result = std::from_chars(s.data(), s.data() + s.size(), value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s.data() + s.size()); + assert(std::isnan(value)); + assert(!std::signbit(value)); + } + for (auto c : "!@#$%^&*(-=+[]{}|\\;:'\",./<>?~` \t\v\r\n") { + F value = 0.25; + s[4] = c; + std::from_chars_result result = std::from_chars(s.data(), s.data() + s.size(), value, fmt); + + assert(result.ec == std::errc{}); + assert(result.ptr == s.data() + 3); + assert(std::isnan(value)); + assert(!std::signbit(value)); + } + } +} + +template +void test_fmt_independent(std::chars_format fmt) { + test_infinity(fmt); + test_nan(fmt); + + { // first == last + F value = 0.25; + std::from_chars_result result = std::from_chars(nullptr, nullptr, value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == nullptr); + assert(value == F(0.25)); + } + { // only a sign + F value = 0.25; + const char* s = "-"; + std::from_chars_result result = std::from_chars(s, s + std::strlen(s), value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == s); + assert(value == F(0.25)); + } + { // only decimal separator + F value = 0.25; + const char* s = "."; + std::from_chars_result result = std::from_chars(s, s + std::strlen(s), value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == s); + assert(value == F(0.25)); + } + { // sign and decimal separator + F value = 0.25; + const char* s = "-."; + std::from_chars_result result = std::from_chars(s, s + std::strlen(s), value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == s); + assert(value == F(0.25)); + } + { // + sign is not allowed + F value = 0.25; + const char* s = "+0.25"; + std::from_chars_result result = std::from_chars(s, s + std::strlen(s), value, fmt); + + assert(result.ec == std::errc::invalid_argument); + assert(result.ptr == s); + assert(value == F(0.25)); + } +} + +template +struct test_basics { + void operator()() { + for (auto fmt : {std::chars_format::scientific, + std::chars_format::fixed, + /*std::chars_format::hex,*/ std::chars_format::general}) + test_fmt_independent(fmt); + } +}; + +template +struct test_fixed { + void operator()() { + std::from_chars_result r; + F x = 0.25; + + // *** Failures + + { // Starts with invalid character + std::array s = {' ', '1'}; + for (auto c : "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "`~!@#$%^&*()_=[]{}\\|;:'\",/<>? 
\t\v\r\n") { + s[0] = c; + r = std::from_chars(s.data(), s.data() + s.size(), x, std::chars_format::fixed); + + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s.data()); + assert(x == F(0.25)); + } + } + + // *** Success + + { // number followed by non-numeric values + const char* s = "001x"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.0)); + } + { // no leading digit + const char* s = ".5"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 2); + assert(x == F(0.5)); + } + { // negative sign and no leading digit + const char* s = "-.5"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(-0.5)); + } + + { // double deciamal point + const char* s = "1.25.78"; + + // This number is halfway between two float values. 
+ r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 4); + assert(x == F(1.25)); + } + { // exponenent no sign + const char* s = "1.5e10"; + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { // exponenent capitalized no sign + const char* s = "1.5E10"; + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { // exponenent + sign + const char* s = "1.5e+10"; + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { // exponenent - sign + const char* s = "1.5e-10"; + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { // Exponent no number + const char* s = "1.5e"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { // Exponent sign no number + { + const char* s = "1.5e+"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { + const char* s = "1.5e-"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + } + { // Exponent with whitespace + { + const char* s = "1.5e +1"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { + const char* s = "1.5e+ 1"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { + const char* s = "1.5e -1"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + { + const char* s = "1.5e- 1"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.5)); + } + } + { // double exponent + const char* s = "1.25e0e12"; + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + + assert(r.ec == std::errc{}); + assert(r.ptr == s + 4); + assert(x == F(1.25)); + } + { // Exponent double sign + { + const char* s = "1.25e++12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 4); + assert(x == F(1.25)); + } + { + const char* s = "1.25e+-12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 4); + assert(x == F(1.25)); + } + { + const char* s = "1.25e-+12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 4); + assert(x == F(1.25)); + } + { + const char* s = "1.25e--12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 4); + assert(x == F(1.25)); + } + } + { // exponent hex prefix + const char* s 
= "1.25e0x12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 4); + assert(x == F(1.25)); + } + { // This number is halfway between two float values. + const char* s = "20040229"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 8); + assert(x == F(20040229)); + } + { // Shifting mantissa exponent and no exponent + const char* s = "123.456"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 7); + assert(x == F(1.23456e2)); + } + { // Shifting mantissa exponent and an exponent + const char* s = "123.456e3"; + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + + assert(r.ec == std::errc{}); + assert(r.ptr == s + 7); + assert(x == F(123.456)); + } + { // Mantissa overflow + { + const char* s = "0.111111111111111111111111111111111111111111"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(0.111111111111111111111111111111111111111111)); + } + { + const char* s = "111111111111.111111111111111111111111111111111111111111"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(111111111111.111111111111111111111111111111111111111111)); + } + } + { // Negative value + const char* s = "-0.25"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::fixed); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(-0.25)); + } + } +}; + +template +struct test_scientific { + void operator()() { + std::from_chars_result r; + F x = 0.25; + + // *** Failures + + { // Starts with invalid character + std::array s = {' ', '1', 'e', '0'}; + for (auto c : "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "`~!@#$%^&*()_=[]{}\\|;:'\",/<>? 
\t\v\r\n") { + s[0] = c; + r = std::from_chars(s.data(), s.data() + s.size(), x, std::chars_format::scientific); + + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s.data()); + assert(x == F(0.25)); + } + } + { // No exponent + const char* s = "1.23"; + r = std::from_chars(s, s + strlen(s), x, std::chars_format::scientific); + + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { // Exponent no number + const char* s = "1.23e"; + r = std::from_chars(s, s + strlen(s), x, std::chars_format::scientific); + + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { // Exponent sign no number + { + const char* s = "1.5e+"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { + const char* s = "1.5e-"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + } + { // Exponent with whitespace + { + const char* s = "1.5e +1"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { + const char* s = "1.5e+ 1"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { + const char* s = "1.5e -1"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { + const char* s = "1.5e- 1"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + } + { // exponent double sign + { + const char* s = "1.25e++12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { + const char* s = "1.25e+-12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { + const char* s = "1.25e-+12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + { + const char* s = "1.25e--12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s); + assert(x == F(0.25)); + } + } + + // *** Success + + { // number followed by non-numeric values + const char* s = "001e0x"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 5); + assert(x == F(1.0)); + } + + { // double deciamal point + const char* s = "1.25e0.78"; + + // This number is halfway between two float values. 
+ r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 6); + assert(x == F(1.25)); + } + + { // exponenent no sign + const char* s = "1.5e10"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 6); + assert(x == F(1.5e10)); + } + { // exponenent capitalized no sign + const char* s = "1.5E10"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 6); + assert(x == F(1.5e10)); + } + { // exponenent + sign + const char* s = "1.5e+10"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 7); + assert(x == F(1.5e10)); + } + { // exponenent - sign + const char* s = "1.5e-10"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 7); + assert(x == F(1.5e-10)); + } + { // exponent hex prefix -> e0 + const char* s = "1.25e0x12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 6); + assert(x == F(1.25)); + } + { // double exponent + const char* s = "1.25e0e12"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 6); + assert(x == F(1.25)); + } + { // This number is halfway between two float values. + const char* s = "20040229e0"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 10); + assert(x == F(20040229)); + } + { // Shifting mantissa exponent and an exponent + const char* s = "123.456e3"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 9); + assert(x == F(1.23456e5)); + } + { // Mantissa overflow + { + const char* s = "0.111111111111111111111111111111111111111111e0"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(0.111111111111111111111111111111111111111111)); + } + { + const char* s = "111111111111.111111111111111111111111111111111111111111e0"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(111111111111.111111111111111111111111111111111111111111)); + } + } + { // Negative value + const char* s = "-0.25e0"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(-0.25)); + } + { // value is too big -> +inf + const char* s = "1e9999999999999999999999999999999999999999"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::result_out_of_range); + assert(r.ptr == s + strlen(s)); + assert(x == std::numeric_limits::infinity()); + } + { // negative value is too big -> -inf + const char* s = "-1e9999999999999999999999999999999999999999"; + + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific); + assert(r.ec == std::errc::result_out_of_range); + assert(r.ptr == s + strlen(s)); + assert(x == -std::numeric_limits::infinity()); + } + { // 
+      const char* s = "1e-9999999999999999999999999999999999999999";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific);
+      assert(r.ec == std::errc::result_out_of_range);
+      assert(r.ptr == s + std::strlen(s));
+      assert(x == F(0.0));
+    }
+    { // negative value is too small -> -0
+      const char* s = "-1e-9999999999999999999999999999999999999999";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::scientific);
+      assert(r.ec == std::errc::result_out_of_range);
+      assert(r.ptr == s + std::strlen(s));
+      assert(x == F(-0.0));
+    }
+  }
+};
+
+template <class F>
+struct test_general {
+  void operator()() {
+    std::from_chars_result r;
+    F x = 0.25;
+
+    // *** Failures
+
+    { // Starts with invalid character
+      std::array s = {' ', '1'};
+      for (auto c : "abcdefghijklmnopqrstuvwxyz"
+                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                    "`~!@#$%^&*()_=[]{}\\|;:'\",/<>? \t\v\r\n") {
+        s[0] = c;
+        r = std::from_chars(s.data(), s.data() + s.size(), x);
+
+        assert(r.ec == std::errc::invalid_argument);
+        assert(r.ptr == s.data());
+        assert(x == F(0.25));
+      }
+    }
+
+    // *** Success
+
+    { // number followed by non-numeric values
+      const char* s = "001x";
+
+      // the expected form of the subject sequence is a nonempty sequence of
+      // decimal digits optionally containing a decimal-point character, then
+      // an optional exponent part as defined in 6.4.4.3, excluding any digit
+      // separators (6.4.4.2); (C23 7.24.1.5)
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 3);
+      assert(x == F(1.0));
+    }
+    { // no leading digit
+      const char* s = ".5e0";
+
+      // the expected form of the subject sequence is a nonempty sequence of
+      // decimal digits optionally containing a decimal-point character, then
+      // an optional exponent part as defined in 6.4.4.3, excluding any digit
+      // separators (6.4.4.2); (C23 7.24.1.5)
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 4);
+      assert(x == F(0.5));
+    }
+    { // negative sign and no leading digit
+      const char* s = "-.5e0";
+
+      // the expected form of the subject sequence is a nonempty sequence of
+      // decimal digits optionally containing a decimal-point character, then
+      // an optional exponent part as defined in 6.4.4.3, excluding any digit
+      // separators (6.4.4.2); (C23 7.24.1.5)
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 5);
+      assert(x == F(-0.5));
+    }
+    { // no leading digit
+      const char* s = ".5";
+
+      // the expected form of the subject sequence is a nonempty sequence of
+      // decimal digits optionally containing a decimal-point character, then
+      // an optional exponent part as defined in 6.4.4.3, excluding any digit
+      // separators (6.4.4.2); (C23 7.24.1.5)
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 2);
+      assert(x == F(0.5));
+    }
+    { // negative sign and no leading digit
+      const char* s = "-.5";
+
+      // the expected form of the subject sequence is a nonempty sequence of
+      // decimal digits optionally containing a decimal-point character, then
+      // an optional exponent part as defined in 6.4.4.3, excluding any digit
+      // separators (6.4.4.2); (C23 7.24.1.5)
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 3);
+      assert(x == F(-0.5));
+    }
+    { // double decimal point
+      const char* s = "1.25.78";
+
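+      // Only "1.25" is consumed; the second '.' terminates the match, so
+      // ".78" is left in the range.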
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 4);
+      assert(x == F(1.25));
+    }
+    { // exponent no sign
+      const char* s = "1.5e10";
+
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 6);
+      assert(x == F(1.5e10));
+    }
+    { // exponent capitalized no sign
+      const char* s = "1.5E10";
+
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 6);
+      assert(x == F(1.5e10));
+    }
+    { // exponent + sign
+      const char* s = "1.5e+10";
+
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 7);
+      assert(x == F(1.5e10));
+    }
+    { // exponent - sign
+      const char* s = "1.5e-10";
+
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 7);
+      assert(x == F(1.5e-10));
+    }
+    { // Exponent no number
+      const char* s = "1.5e";
+
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 3);
+      assert(x == F(1.5));
+    }
+    { // Exponent sign no number
+      {
+        const char* s = "1.5e+";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(1.5));
+      }
+      {
+        const char* s = "1.5e-";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(1.5));
+      }
+    }
+    { // Exponent with whitespace
+      {
+        const char* s = "1.5e +1";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(1.5));
+      }
+      {
+        const char* s = "1.5e+ 1";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(1.5));
+      }
+      {
+        const char* s = "1.5e -1";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(1.5));
+      }
+      {
+        const char* s = "1.5e- 1";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(1.5));
+      }
+    }
+    { // exponent double sign
+      {
+        const char* s = "1.25e++12";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 4);
+        assert(x == F(1.25));
+      }
+      {
+        const char* s = "1.25e+-12";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 4);
+        assert(x == F(1.25));
+      }
+      {
+        const char* s = "1.25e-+12";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 4);
+        assert(x == F(1.25));
+      }
+      {
+        const char* s = "1.25e--12";
+
+        r = std::from_chars(s, s + std::strlen(s), x);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 4);
+        assert(x == F(1.25));
+      }
+    }
+    { // exponent hex prefix -> e0
+      const char* s = "1.25e0x12";
+
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 6);
+      assert(x == F(1.25));
+    }
+    { // double exponent
+      const char* s = "1.25e0e12";
+
+      r = std::from_chars(s, s + std::strlen(s), x);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 6);
+      assert(x == F(1.25));
+    }
+    { // This number is halfway between two float values.
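+      // 20040229 == 0x131CA25 needs 25 significand bits, one more than float
+      // has, so when F is float the value lies exactly halfway between two
+      // representable values and exercises the rounding logic.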
+ const char* s = "20040229"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 8); + assert(x == F(20040229)); + } + { // Shifting mantissa exponent and no exponent + const char* s = "123.456"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 7); + assert(x == F(1.23456e2)); + } + { // Shifting mantissa exponent and an exponent + const char* s = "123.456e3"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 9); + assert(x == F(1.23456e5)); + } + { // Mantissa overflow + { + const char* s = "0.111111111111111111111111111111111111111111"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(0.111111111111111111111111111111111111111111)); + } + { + const char* s = "111111111111.111111111111111111111111111111111111111111"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(111111111111.111111111111111111111111111111111111111111)); + } + } + { // Negative value + const char* s = "-0.25"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc{}); + assert(r.ptr == s + std::strlen(s)); + assert(x == F(-0.25)); + } + { // value is too big -> +inf + const char* s = "1e9999999999999999999999999999999999999999"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc::result_out_of_range); + assert(r.ptr == s + strlen(s)); + assert(x == std::numeric_limits::infinity()); + } + { // negative value is too big -> -inf + const char* s = "-1e9999999999999999999999999999999999999999"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc::result_out_of_range); + assert(r.ptr == s + strlen(s)); + assert(x == -std::numeric_limits::infinity()); + } + { // value is too small -> 0 + const char* s = "1e-9999999999999999999999999999999999999999"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc::result_out_of_range); + assert(r.ptr == s + strlen(s)); + assert(x == F(0.0)); + } + { // negative value is too small -> -0 + const char* s = "-1e-9999999999999999999999999999999999999999"; + + r = std::from_chars(s, s + std::strlen(s), x); + assert(r.ec == std::errc::result_out_of_range); + assert(r.ptr == s + strlen(s)); + assert(x == F(-0.0)); + } + } +}; + +template +struct test_hex { + void operator()() { + std::from_chars_result r; + F x = 0.25; + + // *** Failures + + { // Starts with invalid character + std::array s = {' ', '1', 'e', '0'}; + for (auto c : "ghijklmnopqrstuvwxyz" + "GHIJKLMNOPQRSTUVWXYZ" + "`~!@#$%^&*()_=[]{}\\|;:'\",/<>? 
\t\v\r\n") { + s[0] = c; + r = std::from_chars(s.data(), s.data() + s.size(), x, std::chars_format::hex); + + assert(r.ec == std::errc::invalid_argument); + assert(r.ptr == s.data()); + assert(x == F(0.25)); + } + } + + // *** Success + + { // number followed by non-numeric values + const char* s = "001x"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(1.0)); + } + { // no leading digit + const char* s = ".5p0"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 4); + assert(x == F(0x0.5p0)); + } + { // negative sign and no leading digit + const char* s = "-.5p0"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 5); + assert(x == F(-0x0.5p0)); + } + { // no leading digit + const char* s = ".5"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 2); + assert(x == F(0x0.5p0)); + } + { // negative sign and no leading digit + const char* s = "-.5"; + + // the expected form of the subject sequence is a nonempty sequence of + // decimal digits optionally containing a decimal-point character, then + // an optional exponent part as defined in 6.4.4.3, excluding any digit + // separators (6.4.4.2); (C23 7.24.1.5) + r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex); + assert(r.ec == std::errc{}); + assert(r.ptr == s + 3); + assert(x == F(-0x0.5p0)); + } + { // double deciamal point + const char* s = "1.25.78"; + + // This number is halfway between two float values. 
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 4);
+      assert(x == F(0x1.25p0));
+    }
+    { // exponent no sign
+      const char* s = "1.5p10";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 6);
+      assert(x == F(0x1.5p10));
+    }
+    { // exponent capitalized no sign
+      const char* s = "1.5P10";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 6);
+      assert(x == F(0x1.5p10));
+    }
+    { // exponent + sign
+      const char* s = "1.5p+10";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 7);
+      assert(x == F(0x1.5p10));
+    }
+    { // exponent - sign
+      const char* s = "1.5p-10";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 7);
+      assert(x == F(0x1.5p-10));
+    }
+    { // Exponent no number
+      const char* s = "1.5p";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 3);
+      assert(x == F(0x1.5p0));
+    }
+    { // Exponent sign no number
+      {
+        const char* s = "1.5p+";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(0x1.5p0));
+      }
+      {
+        const char* s = "1.5p-";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(0x1.5p0));
+      }
+    }
+    { // Exponent with whitespace
+      {
+        const char* s = "1.5p +1";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(0x1.5p0));
+      }
+      {
+        const char* s = "1.5p+ 1";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(0x1.5p0));
+      }
+      {
+        const char* s = "1.5p -1";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(0x1.5p0));
+      }
+      {
+        const char* s = "1.5p- 1";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 3);
+        assert(x == F(0x1.5p0));
+      }
+    }
+    { // Exponent double sign
+      {
+        const char* s = "1.25p++12";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 4);
+        assert(x == F(0x1.25p0));
+      }
+      {
+        const char* s = "1.25p+-12";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 4);
+        assert(x == F(0x1.25p0));
+      }
+      {
+        const char* s = "1.25p-+12";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 4);
+        assert(x == F(0x1.25p0));
+      }
+      {
+        const char* s = "1.25p--12";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + 4);
+        assert(x == F(0x1.25p0));
+      }
+    }
+    { // exponent hex prefix -> p0
+      const char* s = "1.25p0x12";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 6);
+      assert(x == F(0x1.25p0));
+    }
+    { // double exponent
+      const char* s = "1.25p0p12";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 6);
+      assert(x == F(0x1.25p0));
+    }
+    { // This number is halfway between two float values.
+      const char* s = "131CA25";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 7);
+      assert(x == F(0x131CA25p0));
+    }
+    { // Shifting mantissa exponent and no exponent
+      const char* s = "123.456";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 7);
+      assert(x == F(0x123.456p0));
+    }
+    { // Shifting mantissa exponent and an exponent
+      const char* s = "123.456p3";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + 9);
+      assert(x == F(0x123.456p3));
+    }
+    { // Mantissa overflow
+      {
+        const char* s = "0.111111111111111111111111111111111111111111";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + std::strlen(s));
+        assert(x == F(0x0.111111111111111111111111111111111111111111p0));
+      }
+      {
+        const char* s = "111111111111.111111111111111111111111111111111111111111";
+
+        r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+        assert(r.ec == std::errc{});
+        assert(r.ptr == s + std::strlen(s));
+        assert(x == F(0x111111111111.111111111111111111111111111111111111111111p0));
+      }
+    }
+    { // Negative value
+      const char* s = "-0.25";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc{});
+      assert(r.ptr == s + std::strlen(s));
+      assert(x == F(-0x0.25p0));
+    }
+    { // value is too big -> +inf
+      const char* s = "1p9999999999999999999999999999999999999999";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc::result_out_of_range);
+      assert(r.ptr == s + std::strlen(s));
+      assert(x == std::numeric_limits<F>::infinity());
+    }
+    { // negative value is too big -> -inf
+      const char* s = "-1p9999999999999999999999999999999999999999";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc::result_out_of_range);
+      assert(r.ptr == s + std::strlen(s));
+      assert(x == -std::numeric_limits<F>::infinity());
+    }
+    { // value is too small -> 0
+      const char* s = "1p-9999999999999999999999999999999999999999";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc::result_out_of_range);
+      assert(r.ptr == s + std::strlen(s));
+      assert(x == F(0.0));
+    }
+    { // negative value is too small -> -0
+      const char* s = "-1p-9999999999999999999999999999999999999999";
+
+      r = std::from_chars(s, s + std::strlen(s), x, std::chars_format::hex);
+      assert(r.ec == std::errc::result_out_of_range);
+      assert(r.ptr == s + std::strlen(s));
+      assert(x == F(-0.0));
+    }
+  }
+};
+
+// The test
+// test/std/utilities/charconv/charconv.msvc/test.cpp
+// uses random values. This test contains errors found by that test.
+void test_random_errors() {
+  {
+    const char* s = "4.219902180869891e-2788";
+    const char* last = s + std::strlen(s) - 1;
+
+    // The character at last is a digit. When that digit is parsed too, the
+    // exponent is e-2788, which returns std::errc::result_out_of_range and
+    // the value 0. The proper exponent is e-278, which can be represented
+    // by a double.
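+    // Parsing the range [s, last) therefore drops the final digit and must
+    // not read past the end of the given range.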
+
+    double value = 0.25;
+    std::from_chars_result result = std::from_chars(s, last, value);
+
+    assert(result.ec == std::errc{});
+    assert(result.ptr == last);
+    assert(value == 4.219902180869891e-278);
+  }
+  {
+    const char* s = "7.411412e-39U";
+    const char* last = s + std::strlen(s) - 1;
+
+    float value = 0.25;
+    std::from_chars_result result = std::from_chars(s, last, value);
+
+    assert(result.ec == std::errc{});
+    assert(result.ptr == last);
+    assert(value == 7.411412e-39F);
+  }
+}
+
+int main(int, char**) {
+  run<test_basics>(all_floats);
+  run<test_fixed>(all_floats);
+  run<test_scientific>(all_floats);
+  run<test_general>(all_floats);
+
+  run<test_hex>(all_floats);
+
+  test_random_errors();
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/charconv/charconv.msvc/test.cpp b/libcxx/test/std/utilities/charconv/charconv.msvc/test.cpp
index 30ee9adcd74b..ace6d46b879b 100644
--- a/libcxx/test/std/utilities/charconv/charconv.msvc/test.cpp
+++ b/libcxx/test/std/utilities/charconv/charconv.msvc/test.cpp
@@ -45,6 +45,7 @@
 #include "float_hex_precision_to_chars_test_cases.hpp"
 #include "float_scientific_precision_to_chars_test_cases.hpp"
 #include "float_to_chars_test_cases.hpp"
+#include "floating_point_test_cases.hpp"
 
 using namespace std;
 
@@ -589,8 +590,8 @@ void test_floating_prefix(const conditional_t(val) == bits, "round-trip", bits);
-#endif
+#endif // TEST_HAS_FROM_CHARS_FLOATING_POINT
     }
 
     {
@@ -656,8 +656,8 @@ void test_floating_hex_prefix(const conditional_t(val) == bits, "(hex) round-trip", bits);
-#endif
+#endif // TEST_HAS_FROM_CHARS_FLOATING_POINT
     }
 }
 
@@ -786,8 +786,7 @@ void test_floating_prefixes(mt19937_64& mt64) {
     }
 }
 
-// TODO Enable once std::from_chars has floating point support.
-#if 0
+#ifdef TEST_HAS_FROM_CHARS_FLOATING_POINT
 template
 void test_floating_from_chars(const chars_format fmt) {
     test_from_chars("", fmt, 0, inv_arg); // no characters
@@ -855,11 +854,13 @@
 
     // The UCRT considers indeterminate NaN to be negative quiet NaN with no payload bits set.
     // It parses "nan(ind)" and "-nan(ind)" identically.
+# ifdef _MSC_VER
     test_from_chars("nan(InD)", fmt, 8, errc{}, -qnan);
     test_from_chars("-nan(InD)", fmt, 9, errc{}, -qnan);
 
     test_from_chars("nan(SnAn)", fmt, 9, errc{}, nullopt, TestFromCharsMode::SignalingNaN);
     test_from_chars("-nan(SnAn)", fmt, 10, errc{}, nullopt, TestFromCharsMode::SignalingNaN);
+# endif
 
     switch (fmt) {
     case chars_format::general:
@@ -941,7 +942,7 @@ void test_floating_from_chars(const chars_format fmt) {
         break;
     }
 }
-#endif
+#endif // TEST_HAS_FROM_CHARS_FLOATING_POINT
 
 template
 void test_floating_to_chars(
@@ -953,13 +954,11 @@ void all_floating_tests(mt19937_64& mt64) {
     test_floating_prefixes(mt64);
 
-// TODO Enable once std::from_chars has floating point support.
-#if 0
+#ifdef TEST_HAS_FROM_CHARS_FLOATING_POINT
     for (const auto& fmt : {chars_format::general, chars_format::scientific, chars_format::fixed, chars_format::hex}) {
         test_floating_from_chars<float>(fmt);
         test_floating_from_chars<double>(fmt);
    }
-
     // Test rounding.
 
     // See float_from_chars_test_cases.hpp in this directory.
@@ -993,7 +992,8 @@
     for (const auto& p : floating_point_test_cases_double) {
         test_from_chars(p.first, chars_format::general, strlen(p.first), errc{}, _Bit_cast<double>(p.second));
     }
-#endif
+#endif // TEST_HAS_FROM_CHARS_FLOATING_POINT
+
     // See float_to_chars_test_cases.hpp in this directory.
     for (const auto& t : float_to_chars_test_cases) {
         if (t.fmt == chars_format{}) {
diff --git a/libcxx/test/std/utilities/charconv/charconv.msvc/test.pass.cpp b/libcxx/test/std/utilities/charconv/charconv.msvc/test.pass.cpp
index 09ef70ea9924..c294a40ce71c 100644
--- a/libcxx/test/std/utilities/charconv/charconv.msvc/test.pass.cpp
+++ b/libcxx/test/std/utilities/charconv/charconv.msvc/test.pass.cpp
@@ -8,6 +8,9 @@
 
 // UNSUPPORTED: c++03, c++11, c++14
 
+// TODO Investigate why this fails
+// UNSUPPORTED: windows
+
 // to_chars requires functions in the dylib that have not been introduced in older
 // versions of the dylib on macOS.
 // XFAIL: availability-fp_to_chars-missing
@@ -22,6 +25,7 @@
 // #include
+#include "test_macros.h"
 
 // Work-around for sprintf_s's usage in the Microsoft tests.
 #ifndef _WIN32
diff --git a/libcxx/test/support/charconv_test_helpers.h b/libcxx/test/support/charconv_test_helpers.h
index f5fbedbeb0dc..fcae09478457 100644
--- a/libcxx/test/support/charconv_test_helpers.h
+++ b/libcxx/test/support/charconv_test_helpers.h
@@ -317,6 +317,8 @@ auto all_unsigned = type_list<
 >();
 auto integrals = concat(all_signed, all_unsigned);
 
+auto all_floats = type_list< float, double >(); // TODO: Add long double
+
 template